From 5ccfa39ddd474fc2edc844539528d10f5f462b54 Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Mon, 14 Dec 2020 11:12:05 +0530 Subject: [PATCH 001/506] leds: Use DEVICE_ATTR_{RW, RO, WO} macros Instead of open coding DEVICE_ATTR() defines, use the DEVICE_ATTR_RW(), DEVICE_ATTR_WO(), and DEVICE_ATTR_RO() macros. This required a few functions to be renamed, but the functionality itself is unchanged. Reviewed-by: Lukas Bulwahn Signed-off-by: Dwaipayan Ray Signed-off-by: Pavel Machek --- drivers/leds/leds-blinkm.c | 24 ++++++++++++------------ drivers/leds/leds-lm3530.c | 10 +++++----- drivers/leds/leds-lm355x.c | 8 ++++---- drivers/leds/leds-lm3642.c | 16 ++++++++-------- drivers/leds/leds-max8997.c | 12 ++++++------ drivers/leds/leds-netxbig.c | 12 ++++++------ drivers/leds/leds-ss4200.c | 12 ++++++------ drivers/leds/leds-wm831x-status.c | 12 ++++++------ 8 files changed, 53 insertions(+), 53 deletions(-) diff --git a/drivers/leds/leds-blinkm.c b/drivers/leds/leds-blinkm.c index e11fe1788242..b4e1fdff4186 100644 --- a/drivers/leds/leds-blinkm.c +++ b/drivers/leds/leds-blinkm.c @@ -192,13 +192,13 @@ static int store_color_common(struct device *dev, const char *buf, int color) return 0; } -static ssize_t show_red(struct device *dev, struct device_attribute *attr, +static ssize_t red_show(struct device *dev, struct device_attribute *attr, char *buf) { return show_color_common(dev, buf, RED); } -static ssize_t store_red(struct device *dev, struct device_attribute *attr, +static ssize_t red_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret; @@ -209,15 +209,15 @@ static ssize_t store_red(struct device *dev, struct device_attribute *attr, return count; } -static DEVICE_ATTR(red, S_IRUGO | S_IWUSR, show_red, store_red); +static DEVICE_ATTR_RW(red); -static ssize_t show_green(struct device *dev, struct device_attribute *attr, +static ssize_t green_show(struct device *dev, struct device_attribute *attr, char *buf) { return show_color_common(dev, buf, GREEN); } -static ssize_t store_green(struct device *dev, struct device_attribute *attr, +static ssize_t green_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -229,15 +229,15 @@ static ssize_t store_green(struct device *dev, struct device_attribute *attr, return count; } -static DEVICE_ATTR(green, S_IRUGO | S_IWUSR, show_green, store_green); +static DEVICE_ATTR_RW(green); -static ssize_t show_blue(struct device *dev, struct device_attribute *attr, +static ssize_t blue_show(struct device *dev, struct device_attribute *attr, char *buf) { return show_color_common(dev, buf, BLUE); } -static ssize_t store_blue(struct device *dev, struct device_attribute *attr, +static ssize_t blue_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret; @@ -248,16 +248,16 @@ static ssize_t store_blue(struct device *dev, struct device_attribute *attr, return count; } -static DEVICE_ATTR(blue, S_IRUGO | S_IWUSR, show_blue, store_blue); +static DEVICE_ATTR_RW(blue); -static ssize_t show_test(struct device *dev, struct device_attribute *attr, +static ssize_t test_show(struct device *dev, struct device_attribute *attr, char *buf) { return scnprintf(buf, PAGE_SIZE, "#Write into test to start test sequence!#\n"); } -static ssize_t store_test(struct device *dev, struct device_attribute *attr, +static ssize_t test_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -273,7 +273,7 @@ static ssize_t store_test(struct device *dev, struct device_attribute *attr, return count; } -static DEVICE_ATTR(test, S_IRUGO | S_IWUSR, show_test, store_test); +static DEVICE_ATTR_RW(test); /* TODO: HSB, fade, timeadj, script ... */ diff --git a/drivers/leds/leds-lm3530.c b/drivers/leds/leds-lm3530.c index 2f8362f6bf75..2db455efd4b1 100644 --- a/drivers/leds/leds-lm3530.c +++ b/drivers/leds/leds-lm3530.c @@ -346,8 +346,8 @@ static void lm3530_brightness_set(struct led_classdev *led_cdev, } } -static ssize_t lm3530_mode_get(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct lm3530_data *drvdata; @@ -365,8 +365,8 @@ static ssize_t lm3530_mode_get(struct device *dev, return len; } -static ssize_t lm3530_mode_set(struct device *dev, struct device_attribute - *attr, const char *buf, size_t size) +static ssize_t mode_store(struct device *dev, struct device_attribute + *attr, const char *buf, size_t size) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct lm3530_data *drvdata; @@ -397,7 +397,7 @@ static ssize_t lm3530_mode_set(struct device *dev, struct device_attribute return sizeof(drvdata->mode); } -static DEVICE_ATTR(mode, 0644, lm3530_mode_get, lm3530_mode_set); +static DEVICE_ATTR_RW(mode); static struct attribute *lm3530_attrs[] = { &dev_attr_mode.attr, diff --git a/drivers/leds/leds-lm355x.c b/drivers/leds/leds-lm355x.c index 1505521249b5..2d3e11845ba5 100644 --- a/drivers/leds/leds-lm355x.c +++ b/drivers/leds/leds-lm355x.c @@ -349,9 +349,9 @@ static int lm355x_indicator_brightness_set(struct led_classdev *cdev, } /* indicator pattern only for lm3556*/ -static ssize_t lm3556_indicator_pattern_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) +static ssize_t pattern_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) { ssize_t ret; struct led_classdev *led_cdev = dev_get_drvdata(dev); @@ -381,7 +381,7 @@ static ssize_t lm3556_indicator_pattern_store(struct device *dev, return ret; } -static DEVICE_ATTR(pattern, S_IWUSR, NULL, lm3556_indicator_pattern_store); +static DEVICE_ATTR_WO(pattern); static struct attribute *lm355x_indicator_attrs[] = { &dev_attr_pattern.attr, diff --git a/drivers/leds/leds-lm3642.c b/drivers/leds/leds-lm3642.c index 62c14872caf7..8007b82985a8 100644 --- a/drivers/leds/leds-lm3642.c +++ b/drivers/leds/leds-lm3642.c @@ -165,9 +165,9 @@ static int lm3642_control(struct lm3642_chip_data *chip, /* torch */ /* torch pin config for lm3642 */ -static ssize_t lm3642_torch_pin_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) +static ssize_t torch_pin_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) { ssize_t ret; struct led_classdev *led_cdev = dev_get_drvdata(dev); @@ -193,7 +193,7 @@ static ssize_t lm3642_torch_pin_store(struct device *dev, return size; } -static DEVICE_ATTR(torch_pin, S_IWUSR, NULL, lm3642_torch_pin_store); +static DEVICE_ATTR_WO(torch_pin); static int lm3642_torch_brightness_set(struct led_classdev *cdev, enum led_brightness brightness) @@ -212,9 +212,9 @@ static int lm3642_torch_brightness_set(struct led_classdev *cdev, /* flash */ /* strobe pin config for lm3642*/ -static ssize_t lm3642_strobe_pin_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) +static ssize_t strobe_pin_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) { ssize_t ret; struct led_classdev *led_cdev = dev_get_drvdata(dev); @@ -240,7 +240,7 @@ static ssize_t lm3642_strobe_pin_store(struct device *dev, return size; } -static DEVICE_ATTR(strobe_pin, S_IWUSR, NULL, lm3642_strobe_pin_store); +static DEVICE_ATTR_WO(strobe_pin); static int lm3642_strobe_brightness_set(struct led_classdev *cdev, enum led_brightness brightness) diff --git a/drivers/leds/leds-max8997.c b/drivers/leds/leds-max8997.c index 512a11d142d0..c0bddb33888d 100644 --- a/drivers/leds/leds-max8997.c +++ b/drivers/leds/leds-max8997.c @@ -160,8 +160,8 @@ static void max8997_led_brightness_set(struct led_classdev *led_cdev, } } -static ssize_t max8997_led_show_mode(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct max8997_led *led = @@ -193,9 +193,9 @@ static ssize_t max8997_led_show_mode(struct device *dev, return ret; } -static ssize_t max8997_led_store_mode(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) +static ssize_t mode_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct max8997_led *led = @@ -222,7 +222,7 @@ static ssize_t max8997_led_store_mode(struct device *dev, return size; } -static DEVICE_ATTR(mode, 0644, max8997_led_show_mode, max8997_led_store_mode); +static DEVICE_ATTR_RW(mode); static struct attribute *max8997_attrs[] = { &dev_attr_mode.attr, diff --git a/drivers/leds/leds-netxbig.c b/drivers/leds/leds-netxbig.c index 68fbf0b66fad..77213b79f84d 100644 --- a/drivers/leds/leds-netxbig.c +++ b/drivers/leds/leds-netxbig.c @@ -204,9 +204,9 @@ static void netxbig_led_set(struct led_classdev *led_cdev, spin_unlock_irqrestore(&led_dat->lock, flags); } -static ssize_t netxbig_led_sata_store(struct device *dev, - struct device_attribute *attr, - const char *buff, size_t count) +static ssize_t sata_store(struct device *dev, + struct device_attribute *attr, + const char *buff, size_t count) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct netxbig_led_data *led_dat = @@ -255,8 +255,8 @@ static ssize_t netxbig_led_sata_store(struct device *dev, return ret; } -static ssize_t netxbig_led_sata_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t sata_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct netxbig_led_data *led_dat = @@ -265,7 +265,7 @@ static ssize_t netxbig_led_sata_show(struct device *dev, return sprintf(buf, "%d\n", led_dat->sata); } -static DEVICE_ATTR(sata, 0644, netxbig_led_sata_show, netxbig_led_sata_store); +static DEVICE_ATTR_RW(sata); static struct attribute *netxbig_led_attrs[] = { &dev_attr_sata.attr, diff --git a/drivers/leds/leds-ss4200.c b/drivers/leds/leds-ss4200.c index 245de443fe9c..a6992323d69d 100644 --- a/drivers/leds/leds-ss4200.c +++ b/drivers/leds/leds-ss4200.c @@ -441,8 +441,8 @@ static void set_power_light_amber_noblink(void) nasgpio_led_set_brightness(&amber->led_cdev, LED_FULL); } -static ssize_t nas_led_blink_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t blink_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct led_classdev *led = dev_get_drvdata(dev); int blinking = 0; @@ -451,9 +451,9 @@ static ssize_t nas_led_blink_show(struct device *dev, return sprintf(buf, "%u\n", blinking); } -static ssize_t nas_led_blink_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) +static ssize_t blink_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) { int ret; struct led_classdev *led = dev_get_drvdata(dev); @@ -468,7 +468,7 @@ static ssize_t nas_led_blink_store(struct device *dev, return size; } -static DEVICE_ATTR(blink, 0644, nas_led_blink_show, nas_led_blink_store); +static DEVICE_ATTR_RW(blink); static struct attribute *nasgpio_led_attrs[] = { &dev_attr_blink.attr, diff --git a/drivers/leds/leds-wm831x-status.c b/drivers/leds/leds-wm831x-status.c index 67f4235cb28a..c48b80574f02 100644 --- a/drivers/leds/leds-wm831x-status.c +++ b/drivers/leds/leds-wm831x-status.c @@ -155,8 +155,8 @@ static const char * const led_src_texts[] = { "soft", }; -static ssize_t wm831x_status_src_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t src_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct wm831x_status *led = to_wm831x_status(led_cdev); @@ -178,9 +178,9 @@ static ssize_t wm831x_status_src_show(struct device *dev, return ret; } -static ssize_t wm831x_status_src_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) +static ssize_t src_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct wm831x_status *led = to_wm831x_status(led_cdev); @@ -197,7 +197,7 @@ static ssize_t wm831x_status_src_store(struct device *dev, return size; } -static DEVICE_ATTR(src, 0644, wm831x_status_src_show, wm831x_status_src_store); +static DEVICE_ATTR_RW(src); static struct attribute *wm831x_status_attrs[] = { &dev_attr_src.attr, From 66898f3fe906364e36fddd5d6292d52022357cf4 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 14 Dec 2020 14:31:49 +0800 Subject: [PATCH 002/506] leds: ss4200: simplify the return expression of register_nasgpio_led() Simplify the return expression. Signed-off-by: Zheng Yongjun Signed-off-by: Pavel Machek --- drivers/leds/leds-ss4200.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/leds/leds-ss4200.c b/drivers/leds/leds-ss4200.c index a6992323d69d..fcaa34706b6c 100644 --- a/drivers/leds/leds-ss4200.c +++ b/drivers/leds/leds-ss4200.c @@ -478,7 +478,6 @@ ATTRIBUTE_GROUPS(nasgpio_led); static int register_nasgpio_led(int led_nr) { - int ret; struct nasgpio_led *nas_led = &nasgpio_leds[led_nr]; struct led_classdev *led = get_classdev_for_led_nr(led_nr); @@ -489,11 +488,8 @@ static int register_nasgpio_led(int led_nr) led->brightness_set = nasgpio_led_set_brightness; led->blink_set = nasgpio_led_set_blink; led->groups = nasgpio_led_groups; - ret = led_classdev_register(&nas_gpio_pci_dev->dev, led); - if (ret) - return ret; - return 0; + return led_classdev_register(&nas_gpio_pci_dev->dev, led); } static void unregister_nasgpio_led(int led_nr) From 4ce6b242b78d355ba899f1b16c6bfcd43df155a0 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 4 Jan 2021 09:59:30 +0800 Subject: [PATCH 003/506] pcmcia: Switch to using the new API kobj_to_dev() fixed the following coccicheck: drivers/pcmcia/cistpl.c:1557:54-55: WARNING opportunity for kobj_to_dev() drivers/pcmcia/cistpl.c:1584:53-54: WARNING opportunity for kobj_to_dev() Signed-off-by: Tian Tao Signed-off-by: Dominik Brodowski --- drivers/pcmcia/cistpl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c index cf109d9a1112..e6939103991b 100644 --- a/drivers/pcmcia/cistpl.c +++ b/drivers/pcmcia/cistpl.c @@ -1554,7 +1554,7 @@ static ssize_t pccard_show_cis(struct file *filp, struct kobject *kobj, if (off + count > size) count = size - off; - s = to_socket(container_of(kobj, struct device, kobj)); + s = to_socket(kobj_to_dev(kobj)); if (!(s->state & SOCKET_PRESENT)) return -ENODEV; @@ -1581,7 +1581,7 @@ static ssize_t pccard_store_cis(struct file *filp, struct kobject *kobj, if (error) return error; - s = to_socket(container_of(kobj, struct device, kobj)); + s = to_socket(kobj_to_dev(kobj)); if (off) return -EINVAL; From 641e8cd2cbf0451e118fca33d5db10124dd87458 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 26 Nov 2020 10:40:38 +0800 Subject: [PATCH 004/506] riscv: Cleanup sbi function stubs when RISCV_SBI disabled Fix sbi_init() function declaration mismatch between RISCV_SBI enable and disable, as it always returned 0, make it void function. Drop some stubs which won't be used if RISCV_SBI disabled. Signed-off-by: Kefeng Wang Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 10 +++------- arch/riscv/kernel/sbi.c | 4 +--- arch/riscv/kernel/setup.c | 3 +-- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 653edb25d495..99dc77ba8e43 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -89,7 +89,7 @@ struct sbiret { long value; }; -int sbi_init(void); +void sbi_init(void); struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, @@ -147,11 +147,7 @@ static inline unsigned long sbi_minor_version(void) int sbi_err_map_linux_errno(int err); #else /* CONFIG_RISCV_SBI */ -/* stubs for code that is only reachable under IS_ENABLED(CONFIG_RISCV_SBI): */ -void sbi_set_timer(uint64_t stime_value); -void sbi_clear_ipi(void); -void sbi_send_ipi(const unsigned long *hart_mask); -void sbi_remote_fence_i(const unsigned long *hart_mask); -void sbi_init(void); +static inline void sbi_remote_fence_i(const unsigned long *hart_mask) {} +static inline void sbi_init(void) {} #endif /* CONFIG_RISCV_SBI */ #endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 226ccce0f9e0..bae6ffdc6e7b 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -560,7 +560,7 @@ static struct riscv_ipi_ops sbi_ipi_ops = { .ipi_inject = sbi_send_cpumask_ipi }; -int __init sbi_init(void) +void __init sbi_init(void) { int ret; @@ -600,6 +600,4 @@ int __init sbi_init(void) } riscv_set_ipi_ops(&sbi_ipi_ops); - - return 0; } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 1d85e9bf783c..a43a954ef529 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -251,8 +251,7 @@ void __init setup_arch(char **cmdline_p) pr_err("No DTB found in kernel mappings\n"); #endif - if (IS_ENABLED(CONFIG_RISCV_SBI)) - sbi_init(); + sbi_init(); if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) protect_kernel_text_data(); From 21733cb51847132567e0f5f568f126de08669842 Mon Sep 17 00:00:00 2001 From: Eric Lin Date: Fri, 4 Dec 2020 13:42:58 +0800 Subject: [PATCH 005/506] riscv/mm: Introduce a die_kernel_fault() helper function Like arm64, this patch adds a die_kernel_fault() helper to ensure the same semantics for the different kernel faults. Signed-off-by: Eric Lin Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 3c8b9e433c67..0d5f06d6e3c7 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -19,8 +19,23 @@ #include "../kernel/head.h" +static void die_kernel_fault(const char *msg, unsigned long addr, + struct pt_regs *regs) +{ + bust_spinlocks(1); + + pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", msg, + addr); + + bust_spinlocks(0); + die(regs, "Oops"); + do_exit(SIGKILL); +} + static inline void no_context(struct pt_regs *regs, unsigned long addr) { + const char *msg; + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; @@ -29,12 +44,8 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr) * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ - bust_spinlocks(1); - pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); - die(regs, "Oops"); - do_exit(SIGKILL); + msg = (addr < PAGE_SIZE) ? "NULL pointer dereference" : "paging request"; + die_kernel_fault(msg, addr, regs); } static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) From 21855cac82d3264aa660deafa9c26b8eef548b7a Mon Sep 17 00:00:00 2001 From: Eric Lin Date: Fri, 4 Dec 2020 13:42:59 +0800 Subject: [PATCH 006/506] riscv/mm: Prevent kernel module to access user memory without uaccess routines We found this issue in an legacy out-of-tree kernel module which didn't properly access user space pointer by get/put_user(). Such an illegal access loops in the page fault handler. To resolve this, let it die here. Signed-off-by: Eric Lin Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 0d5f06d6e3c7..33d284188f9a 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -243,6 +243,11 @@ asmlinkage void do_page_fault(struct pt_regs *regs) if (user_mode(regs)) flags |= FAULT_FLAG_USER; + if (!user_mode(regs) && addr < TASK_SIZE && + unlikely(!(regs->status & SR_SUM))) + die_kernel_fault("access to user memory without uaccess routines", + addr, regs); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (cause == EXC_STORE_PAGE_FAULT) From af951c3a113bc2cc0419e39f5752ca77f7ddf228 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Thu, 10 Dec 2020 15:58:02 +0530 Subject: [PATCH 007/506] dt-bindings: riscv: Update l2 cache DT documentation to add support for SiFive FU740 The L2 cache controller in SiFive FU740 has 4 ECC interrupt sources as compared to 3 in FU540. Update the DT documentation accordingly with "compatible" and "interrupt" property changes. Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- .../bindings/riscv/sifive-l2-cache.yaml | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml b/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml index efc0198eeb74..6a576dce1f31 100644 --- a/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml +++ b/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml @@ -27,6 +27,7 @@ select: items: - enum: - sifive,fu540-c000-ccache + - sifive,fu740-c000-ccache required: - compatible @@ -34,7 +35,9 @@ select: properties: compatible: items: - - const: sifive,fu540-c000-ccache + - enum: + - sifive,fu540-c000-ccache + - sifive,fu740-c000-ccache - const: cache cache-block-size: @@ -52,10 +55,13 @@ properties: cache-unified: true interrupts: - description: | - Must contain entries for DirError, DataError and DataFail signals. minItems: 3 - maxItems: 3 + maxItems: 4 + items: + - description: DirError interrupt + - description: DataError interrupt + - description: DataFail interrupt + - description: DirFail interrupt reg: maxItems: 1 @@ -67,6 +73,26 @@ properties: The reference to the reserved-memory for the L2 Loosely Integrated Memory region. The reserved memory node should be defined as per the bindings in reserved-memory.txt. +if: + properties: + compatible: + contains: + const: sifive,fu540-c000-ccache + +then: + properties: + interrupts: + description: | + Must contain entries for DirError, DataError and DataFail signals. + maxItems: 3 + +else: + properties: + interrupts: + description: | + Must contain entries for DirError, DataError, DataFail, DirFail signals. + minItems: 4 + additionalProperties: false required: From 507308b8ccc90d37b07bfca8ffe130435d6b354f Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Thu, 10 Dec 2020 15:58:03 +0530 Subject: [PATCH 008/506] RISC-V: sifive_l2_cache: Update L2 cache driver to support SiFive FU740 SiFive FU740 has 4 ECC interrupt sources as compared to 3 in FU540. Update the L2 cache controller driver to support this additional interrupt in case of FU740-C000 chip. Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- drivers/soc/sifive/sifive_l2_cache.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/soc/sifive/sifive_l2_cache.c b/drivers/soc/sifive/sifive_l2_cache.c index 44d7e1951da3..59640a1d0b28 100644 --- a/drivers/soc/sifive/sifive_l2_cache.c +++ b/drivers/soc/sifive/sifive_l2_cache.c @@ -17,6 +17,10 @@ #define SIFIVE_L2_DIRECCFIX_HIGH 0x104 #define SIFIVE_L2_DIRECCFIX_COUNT 0x108 +#define SIFIVE_L2_DIRECCFAIL_LOW 0x120 +#define SIFIVE_L2_DIRECCFAIL_HIGH 0x124 +#define SIFIVE_L2_DIRECCFAIL_COUNT 0x128 + #define SIFIVE_L2_DATECCFIX_LOW 0x140 #define SIFIVE_L2_DATECCFIX_HIGH 0x144 #define SIFIVE_L2_DATECCFIX_COUNT 0x148 @@ -29,7 +33,7 @@ #define SIFIVE_L2_WAYENABLE 0x08 #define SIFIVE_L2_ECCINJECTERR 0x40 -#define SIFIVE_L2_MAX_ECCINTR 3 +#define SIFIVE_L2_MAX_ECCINTR 4 static void __iomem *l2_base; static int g_irq[SIFIVE_L2_MAX_ECCINTR]; @@ -39,6 +43,7 @@ enum { DIR_CORR = 0, DATA_CORR, DATA_UNCORR, + DIR_UNCORR, }; #ifdef CONFIG_DEBUG_FS @@ -93,6 +98,7 @@ static void l2_config_read(void) static const struct of_device_id sifive_l2_ids[] = { { .compatible = "sifive,fu540-c000-ccache" }, + { .compatible = "sifive,fu740-c000-ccache" }, { /* end of table */ }, }; @@ -155,6 +161,15 @@ static irqreturn_t l2_int_handler(int irq, void *device) atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_CE, "DirECCFix"); } + if (irq == g_irq[DIR_UNCORR]) { + add_h = readl(l2_base + SIFIVE_L2_DIRECCFAIL_HIGH); + add_l = readl(l2_base + SIFIVE_L2_DIRECCFAIL_LOW); + /* Reading this register clears the DirFail interrupt sig */ + readl(l2_base + SIFIVE_L2_DIRECCFAIL_COUNT); + atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_UE, + "DirECCFail"); + panic("L2CACHE: DirFail @ 0x%08X.%08X\n", add_h, add_l); + } if (irq == g_irq[DATA_CORR]) { add_h = readl(l2_base + SIFIVE_L2_DATECCFIX_HIGH); add_l = readl(l2_base + SIFIVE_L2_DATECCFIX_LOW); @@ -181,7 +196,7 @@ static int __init sifive_l2_init(void) { struct device_node *np; struct resource res; - int i, rc; + int i, rc, intr_num; np = of_find_matching_node(NULL, sifive_l2_ids); if (!np) @@ -194,7 +209,13 @@ static int __init sifive_l2_init(void) if (!l2_base) return -ENOMEM; - for (i = 0; i < SIFIVE_L2_MAX_ECCINTR; i++) { + intr_num = of_property_count_u32_elems(np, "interrupts"); + if (!intr_num) { + pr_err("L2CACHE: no interrupts property\n"); + return -ENODEV; + } + + for (i = 0; i < intr_num; i++) { g_irq[i] = irq_of_parse_and_map(np, i); rc = request_irq(g_irq[i], l2_int_handler, 0, "l2_ecc", NULL); if (rc) { From 75e6d7248efccc2b13d0f3811b29d3e5cb04bcad Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 8 Dec 2020 10:25:33 +0530 Subject: [PATCH 009/506] dt-bindings: riscv: Update DT binding docs to support SiFive FU740 SoC Add new compatible strings in cpus.yaml to support the E71 and U74 CPU cores ("harts") that are present on FU740-C000 SoC. Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/cpus.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index c6925e0b16e4..eb6843f69f7c 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -28,11 +28,17 @@ properties: - items: - enum: - sifive,rocket0 + - sifive,bullet0 - sifive,e5 + - sifive,e7 - sifive,e51 + - sifive,e71 - sifive,u54-mc + - sifive,u74-mc - sifive,u54 + - sifive,u74 - sifive,u5 + - sifive,u7 - const: riscv - const: riscv # Simulator only description: From b1f592d5c1e31dfa46a161c52740f3bff1b1e963 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 8 Dec 2020 10:25:35 +0530 Subject: [PATCH 010/506] dt-bindings: pwm: Update DT binding docs to support SiFive FU740 SoC Add new compatible strings to the DT binding documents to support SiFive FU740-C000. Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/pwm/pwm-sifive.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml b/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml index 5ac25275d8bf..84e66913d042 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml +++ b/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml @@ -25,12 +25,15 @@ description: properties: compatible: items: - - const: sifive,fu540-c000-pwm + - enum: + - sifive,fu540-c000-pwm + - sifive,fu740-c000-pwm - const: sifive,pwm0 description: Should be "sifive,-pwm" and "sifive,pwm". Supported - compatible strings are "sifive,fu540-c000-pwm" for the SiFive PWM v0 - as integrated onto the SiFive FU540 chip, and "sifive,pwm0" for the + compatible strings are "sifive,fu540-c000-pwm" and + "sifive,fu740-c000-pwm" for the SiFive PWM v0 as integrated onto the + SiFive FU540 and FU740 chip respectively, and "sifive,pwm0" for the SiFive PWM v0 IP block with no chip integration tweaks. Please refer to sifive-blocks-ip-versioning.txt for details. From 42cf244c8f03cc8a801333fe2fa3b7efa760d119 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 8 Dec 2020 10:25:37 +0530 Subject: [PATCH 011/506] dt-bindings: gpio: Update DT binding docs to support SiFive FU740 SoC Add new compatible strings to the DT binding documents to support SiFive FU740-C000. Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/gpio/sifive,gpio.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml index a0efd8dc2538..ab22056f8b44 100644 --- a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml @@ -13,7 +13,9 @@ maintainers: properties: compatible: items: - - const: sifive,fu540-c000-gpio + - enum: + - sifive,fu540-c000-gpio + - sifive,fu740-c000-gpio - const: sifive,gpio0 reg: From 57985788158a5a6b77612e531b9d89bcad06e47c Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 8 Dec 2020 10:25:39 +0530 Subject: [PATCH 012/506] riscv: dts: add initial support for the SiFive FU740-C000 SoC Add initial support for the SiFive FU540-C000 SoC. FU740-C000 is built around the SiFIve U7 Core Complex and a TileLink interconnect. This file is expected to grow as more device drivers are added to the kernel. Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/sifive/fu740-c000.dtsi | 293 +++++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 arch/riscv/boot/dts/sifive/fu740-c000.dtsi diff --git a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi new file mode 100644 index 000000000000..eeb4f8c3e0e7 --- /dev/null +++ b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2020 SiFive, Inc */ + +/dts-v1/; + +#include + +/ { + #address-cells = <2>; + #size-cells = <2>; + compatible = "sifive,fu740-c000", "sifive,fu740"; + + aliases { + serial0 = &uart0; + serial1 = &uart1; + ethernet0 = ð0; + }; + + chosen { + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + cpu0: cpu@0 { + compatible = "sifive,bullet0", "riscv"; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <128>; + i-cache-size = <16384>; + next-level-cache = <&ccache>; + reg = <0x0>; + riscv,isa = "rv64imac"; + status = "disabled"; + cpu0_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + cpu1: cpu@1 { + compatible = "sifive,bullet0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <40>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <128>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <40>; + mmu-type = "riscv,sv39"; + next-level-cache = <&ccache>; + reg = <0x1>; + riscv,isa = "rv64imafdc"; + tlb-split; + cpu1_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + cpu2: cpu@2 { + compatible = "sifive,bullet0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <40>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <128>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <40>; + mmu-type = "riscv,sv39"; + next-level-cache = <&ccache>; + reg = <0x2>; + riscv,isa = "rv64imafdc"; + tlb-split; + cpu2_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + cpu3: cpu@3 { + compatible = "sifive,bullet0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <40>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <128>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <40>; + mmu-type = "riscv,sv39"; + next-level-cache = <&ccache>; + reg = <0x3>; + riscv,isa = "rv64imafdc"; + tlb-split; + cpu3_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + cpu4: cpu@4 { + compatible = "sifive,bullet0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <40>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <128>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <40>; + mmu-type = "riscv,sv39"; + next-level-cache = <&ccache>; + reg = <0x4>; + riscv,isa = "rv64imafdc"; + tlb-split; + cpu4_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + }; + soc { + #address-cells = <2>; + #size-cells = <2>; + compatible = "simple-bus"; + ranges; + plic0: interrupt-controller@c000000 { + #interrupt-cells = <1>; + #address-cells = <0>; + compatible = "sifive,fu540-c000-plic", "sifive,plic-1.0.0"; + reg = <0x0 0xc000000 0x0 0x4000000>; + riscv,ndev = <69>; + interrupt-controller; + interrupts-extended = < + &cpu0_intc 0xffffffff + &cpu1_intc 0xffffffff &cpu1_intc 9 + &cpu2_intc 0xffffffff &cpu2_intc 9 + &cpu3_intc 0xffffffff &cpu3_intc 9 + &cpu4_intc 0xffffffff &cpu4_intc 9>; + }; + prci: clock-controller@10000000 { + compatible = "sifive,fu740-c000-prci"; + reg = <0x0 0x10000000 0x0 0x1000>; + clocks = <&hfclk>, <&rtcclk>; + #clock-cells = <1>; + }; + uart0: serial@10010000 { + compatible = "sifive,fu740-c000-uart", "sifive,uart0"; + reg = <0x0 0x10010000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <39>; + clocks = <&prci PRCI_CLK_PCLK>; + status = "disabled"; + }; + uart1: serial@10011000 { + compatible = "sifive,fu740-c000-uart", "sifive,uart0"; + reg = <0x0 0x10011000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <40>; + clocks = <&prci PRCI_CLK_PCLK>; + status = "disabled"; + }; + i2c0: i2c@10030000 { + compatible = "sifive,fu740-c000-i2c", "sifive,i2c0"; + reg = <0x0 0x10030000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <52>; + clocks = <&prci PRCI_CLK_PCLK>; + reg-shift = <2>; + reg-io-width = <1>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + i2c1: i2c@10031000 { + compatible = "sifive,fu740-c000-i2c", "sifive,i2c0"; + reg = <0x0 0x10031000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <53>; + clocks = <&prci PRCI_CLK_PCLK>; + reg-shift = <2>; + reg-io-width = <1>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + qspi0: spi@10040000 { + compatible = "sifive,fu740-c000-spi", "sifive,spi0"; + reg = <0x0 0x10040000 0x0 0x1000>, + <0x0 0x20000000 0x0 0x10000000>; + interrupt-parent = <&plic0>; + interrupts = <41>; + clocks = <&prci PRCI_CLK_PCLK>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + qspi1: spi@10041000 { + compatible = "sifive,fu740-c000-spi", "sifive,spi0"; + reg = <0x0 0x10041000 0x0 0x1000>, + <0x0 0x30000000 0x0 0x10000000>; + interrupt-parent = <&plic0>; + interrupts = <42>; + clocks = <&prci PRCI_CLK_PCLK>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + spi0: spi@10050000 { + compatible = "sifive,fu740-c000-spi", "sifive,spi0"; + reg = <0x0 0x10050000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <43>; + clocks = <&prci PRCI_CLK_PCLK>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + eth0: ethernet@10090000 { + compatible = "sifive,fu540-c000-gem"; + interrupt-parent = <&plic0>; + interrupts = <55>; + reg = <0x0 0x10090000 0x0 0x2000>, + <0x0 0x100a0000 0x0 0x1000>; + local-mac-address = [00 00 00 00 00 00]; + clock-names = "pclk", "hclk"; + clocks = <&prci PRCI_CLK_GEMGXLPLL>, + <&prci PRCI_CLK_GEMGXLPLL>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + pwm0: pwm@10020000 { + compatible = "sifive,fu740-c000-pwm", "sifive,pwm0"; + reg = <0x0 0x10020000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <44>, <45>, <46>, <47>; + clocks = <&prci PRCI_CLK_PCLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + pwm1: pwm@10021000 { + compatible = "sifive,fu740-c000-pwm", "sifive,pwm0"; + reg = <0x0 0x10021000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <48>, <49>, <50>, <51>; + clocks = <&prci PRCI_CLK_PCLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + ccache: cache-controller@2010000 { + compatible = "sifive,fu740-c000-ccache", "cache"; + cache-block-size = <64>; + cache-level = <2>; + cache-sets = <2048>; + cache-size = <2097152>; + cache-unified; + interrupt-parent = <&plic0>; + interrupts = <19 20 21 22>; + reg = <0x0 0x2010000 0x0 0x1000>; + }; + gpio: gpio@10060000 { + compatible = "sifive,fu740-c000-gpio", "sifive,gpio0"; + interrupt-parent = <&plic0>; + interrupts = <23>, <24>, <25>, <26>, <27>, <28>, <29>, + <30>, <31>, <32>, <33>, <34>, <35>, <36>, + <37>, <38>; + reg = <0x0 0x10060000 0x0 0x1000>; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + clocks = <&prci PRCI_CLK_PCLK>; + status = "disabled"; + }; + }; +}; From 3489c030102fee69ef423f71f279d7bfe547aed8 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 8 Dec 2020 10:25:40 +0530 Subject: [PATCH 013/506] dt-bindings: riscv: Update YAML doc to support SiFive HiFive Unmatched board Add new compatible strings to the YAML DT binding document to support SiFive's HiFive Unmatched board Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- .../devicetree/bindings/riscv/sifive.yaml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/riscv/sifive.yaml b/Documentation/devicetree/bindings/riscv/sifive.yaml index 3a8647d1da4c..ee0a239af4c2 100644 --- a/Documentation/devicetree/bindings/riscv/sifive.yaml +++ b/Documentation/devicetree/bindings/riscv/sifive.yaml @@ -17,11 +17,18 @@ properties: $nodename: const: '/' compatible: - items: - - enum: - - sifive,hifive-unleashed-a00 - - const: sifive,fu540-c000 - - const: sifive,fu540 + oneOf: + - items: + - enum: + - sifive,hifive-unleashed-a00 + - const: sifive,fu540-c000 + - const: sifive,fu540 + + - items: + - enum: + - sifive,hifive-unmatched-a00 + - const: sifive,fu740-c000 + - const: sifive,fu740 additionalProperties: true From d573b5558abb5281306ecb35b41a2814abd8d7e4 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 8 Dec 2020 10:25:41 +0530 Subject: [PATCH 014/506] riscv: dts: add initial board data for the SiFive HiFive Unmatched Add initial board data for the SiFive HiFive Unmatched A00. This patch is dependent on Zong's Patchset[0]. [0]: https://lore.kernel.org/linux-riscv/20201130082330.77268-4-zong.li@sifive.com/T/#u Signed-off-by: Yash Shah Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/sifive/Makefile | 3 +- .../boot/dts/sifive/hifive-unmatched-a00.dts | 253 ++++++++++++++++++ 2 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts diff --git a/arch/riscv/boot/dts/sifive/Makefile b/arch/riscv/boot/dts/sifive/Makefile index 6d6189e6e4af..74c47fe9fc22 100644 --- a/arch/riscv/boot/dts/sifive/Makefile +++ b/arch/riscv/boot/dts/sifive/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -dtb-$(CONFIG_SOC_SIFIVE) += hifive-unleashed-a00.dtb +dtb-$(CONFIG_SOC_SIFIVE) += hifive-unleashed-a00.dtb \ + hifive-unmatched-a00.dtb diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts new file mode 100644 index 000000000000..b1c3c596578f --- /dev/null +++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2020 SiFive, Inc */ + +#include "fu740-c000.dtsi" +#include + +/* Clock frequency (in Hz) of the PCB crystal for rtcclk */ +#define RTCCLK_FREQ 1000000 + +/ { + #address-cells = <2>; + #size-cells = <2>; + model = "SiFive HiFive Unmatched A00"; + compatible = "sifive,hifive-unmatched-a00", "sifive,fu740-c000", + "sifive,fu740"; + + chosen { + stdout-path = "serial0"; + }; + + cpus { + timebase-frequency = ; + }; + + memory@80000000 { + device_type = "memory"; + reg = <0x0 0x80000000 0x2 0x00000000>; + }; + + soc { + }; + + hfclk: hfclk { + #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <26000000>; + clock-output-names = "hfclk"; + }; + + rtcclk: rtcclk { + #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = ; + clock-output-names = "rtcclk"; + }; +}; + +&uart0 { + status = "okay"; +}; + +&uart1 { + status = "okay"; +}; + +&i2c0 { + status = "okay"; + + temperature-sensor@4c { + compatible = "ti,tmp451"; + reg = <0x4c>; + interrupt-parent = <&gpio>; + interrupts = <6 IRQ_TYPE_LEVEL_LOW>; + }; + + pmic@58 { + compatible = "dlg,da9063"; + reg = <0x58>; + interrupt-parent = <&gpio>; + interrupts = <1 IRQ_TYPE_LEVEL_LOW>; + interrupt-controller; + + regulators { + vdd_bcore1: bcore1 { + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <900000>; + regulator-min-microamp = <5000000>; + regulator-max-microamp = <5000000>; + regulator-always-on; + }; + + vdd_bcore2: bcore2 { + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <900000>; + regulator-min-microamp = <5000000>; + regulator-max-microamp = <5000000>; + regulator-always-on; + }; + + vdd_bpro: bpro { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <2500000>; + regulator-max-microamp = <2500000>; + regulator-always-on; + }; + + vdd_bperi: bperi { + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + regulator-min-microamp = <1500000>; + regulator-max-microamp = <1500000>; + regulator-always-on; + }; + + vdd_bmem: bmem { + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <3000000>; + regulator-always-on; + }; + + vdd_bio: bio { + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <3000000>; + regulator-always-on; + }; + + vdd_ldo1: ldo1 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <100000>; + regulator-max-microamp = <100000>; + regulator-always-on; + }; + + vdd_ldo2: ldo2 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <200000>; + regulator-max-microamp = <200000>; + regulator-always-on; + }; + + vdd_ldo3: ldo3 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <200000>; + regulator-max-microamp = <200000>; + regulator-always-on; + }; + + vdd_ldo4: ldo4 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <200000>; + regulator-max-microamp = <200000>; + regulator-always-on; + }; + + vdd_ldo5: ldo5 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <100000>; + regulator-max-microamp = <100000>; + regulator-always-on; + }; + + vdd_ldo6: ldo6 { + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-min-microamp = <200000>; + regulator-max-microamp = <200000>; + regulator-always-on; + }; + + vdd_ldo7: ldo7 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <200000>; + regulator-max-microamp = <200000>; + regulator-always-on; + }; + + vdd_ldo8: ldo8 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-min-microamp = <200000>; + regulator-max-microamp = <200000>; + regulator-always-on; + }; + + vdd_ld09: ldo9 { + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + regulator-min-microamp = <200000>; + regulator-max-microamp = <200000>; + }; + + vdd_ldo10: ldo10 { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <1000000>; + regulator-min-microamp = <300000>; + regulator-max-microamp = <300000>; + }; + + vdd_ldo11: ldo11 { + regulator-min-microvolt = <2500000>; + regulator-max-microvolt = <2500000>; + regulator-min-microamp = <300000>; + regulator-max-microamp = <300000>; + regulator-always-on; + }; + }; + }; +}; + +&qspi0 { + status = "okay"; + flash@0 { + compatible = "issi,is25wp256", "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <50000000>; + m25p,fast-read; + spi-tx-bus-width = <4>; + spi-rx-bus-width = <4>; + }; +}; + +&spi0 { + status = "okay"; + mmc@0 { + compatible = "mmc-spi-slot"; + reg = <0>; + spi-max-frequency = <20000000>; + voltage-ranges = <3300 3300>; + disable-wp; + }; +}; + +ð0 { + status = "okay"; + phy-mode = "gmii"; + phy-handle = <&phy0>; + phy0: ethernet-phy@0 { + reg = <0>; + }; +}; + +&pwm0 { + status = "okay"; +}; + +&pwm1 { + status = "okay"; +}; + +&gpio { + status = "okay"; +}; From d5805af9fe9ffe4a9d975e9bc39496f57a161076 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:37 +0900 Subject: [PATCH 015/506] riscv: Fix builtin DTB handling All SiPeed K210 MAIX boards have the exact same vendor, arch and implementation IDs, preventing differentiation to select the correct device tree to use through the SOC_BUILTIN_DTB_DECLARE() macro. This result in this macro to be useless and mandates changing the code of the sysctl driver to change the builtin device tree suitable for the target board. Fix this problem by removing the SOC_BUILTIN_DTB_DECLARE() macro since it is used only for the K210 support. The code searching the builtin DTBs using the vendor, arch an implementation IDs is also removed. Support for builtin DTB falls back to the simpler and more traditional handling of builtin DTB using the CONFIG_BUILTIN_DTB option, similarly to other architectures. Signed-off-by: Damien Le Moal Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.socs | 19 ++++++++++---- arch/riscv/boot/dts/Makefile | 2 +- arch/riscv/boot/dts/kendryte/Makefile | 5 ++-- arch/riscv/include/asm/soc.h | 38 --------------------------- arch/riscv/kernel/soc.c | 27 ------------------- arch/riscv/mm/init.c | 6 +---- drivers/soc/kendryte/k210-sysctl.c | 12 --------- 7 files changed, 19 insertions(+), 90 deletions(-) diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 3284d5c291be..59000675cb9f 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -32,9 +32,7 @@ config SOC_KENDRYTE help This enables support for Kendryte K210 SoC platform hardware. -config SOC_KENDRYTE_K210_DTB - def_bool y - depends on SOC_KENDRYTE_K210_DTB_BUILTIN +if SOC_KENDRYTE config SOC_KENDRYTE_K210_DTB_BUILTIN bool "Builtin device tree for the Kendryte K210" @@ -42,10 +40,21 @@ config SOC_KENDRYTE_K210_DTB_BUILTIN default y select OF select BUILTIN_DTB - select SOC_KENDRYTE_K210_DTB help - Builds a device tree for the Kendryte K210 into the Linux image. + Build a device tree for the Kendryte K210 into the Linux image. This option should be selected if no bootloader is being used. If unsure, say Y. +config SOC_KENDRYTE_K210_DTB_SOURCE + string "Source file for the Kendryte K210 builtin DTB" + depends on SOC_KENDRYTE + depends on SOC_KENDRYTE_K210_DTB_BUILTIN + default "k210" + help + Base name (without suffix, relative to arch/riscv/boot/dts/kendryte) + for the DTS file that will be used to produce the DTB linked into the + kernel. + +endif + endmenu diff --git a/arch/riscv/boot/dts/Makefile b/arch/riscv/boot/dts/Makefile index ca1f8cbd78c0..21e3905f1c44 100644 --- a/arch/riscv/boot/dts/Makefile +++ b/arch/riscv/boot/dts/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 subdir-y += sifive -subdir-y += kendryte +subdir-$(CONFIG_SOC_KENDRYTE) += kendryte obj-$(CONFIG_BUILTIN_DTB) := $(addsuffix /, $(subdir-y)) diff --git a/arch/riscv/boot/dts/kendryte/Makefile b/arch/riscv/boot/dts/kendryte/Makefile index 1a88e616f18e..83636693166d 100644 --- a/arch/riscv/boot/dts/kendryte/Makefile +++ b/arch/riscv/boot/dts/kendryte/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -dtb-$(CONFIG_SOC_KENDRYTE_K210_DTB) += k210.dtb - +ifneq ($(CONFIG_SOC_KENDRYTE_K210_DTB_SOURCE),"") +dtb-y += $(strip $(shell echo $(CONFIG_SOC_KENDRYTE_K210_DTB_SOURCE))).dtb obj-$(CONFIG_SOC_KENDRYTE_K210_DTB_BUILTIN) += $(addsuffix .o, $(dtb-y)) +endif diff --git a/arch/riscv/include/asm/soc.h b/arch/riscv/include/asm/soc.h index 6c8363b1f327..f494066051a2 100644 --- a/arch/riscv/include/asm/soc.h +++ b/arch/riscv/include/asm/soc.h @@ -21,42 +21,4 @@ void soc_early_init(void); extern unsigned long __soc_early_init_table_start; extern unsigned long __soc_early_init_table_end; -/* - * Allows Linux to provide a device tree, which is necessary for SOCs that - * don't provide a useful one on their own. - */ -struct soc_builtin_dtb { - unsigned long vendor_id; - unsigned long arch_id; - unsigned long imp_id; - void *(*dtb_func)(void); -}; - -/* - * The argument name must specify a valid DTS file name without the dts - * extension. - */ -#define SOC_BUILTIN_DTB_DECLARE(name, vendor, arch, impl) \ - extern void *__dtb_##name##_begin; \ - \ - static __init __used \ - void *__soc_builtin_dtb_f__##name(void) \ - { \ - return (void *)&__dtb_##name##_begin; \ - } \ - \ - static const struct soc_builtin_dtb __soc_builtin_dtb__##name \ - __used __section("__soc_builtin_dtb_table") = \ - { \ - .vendor_id = vendor, \ - .arch_id = arch, \ - .imp_id = impl, \ - .dtb_func = __soc_builtin_dtb_f__##name, \ - } - -extern unsigned long __soc_builtin_dtb_table_start; -extern unsigned long __soc_builtin_dtb_table_end; - -void *soc_lookup_builtin_dtb(void); - #endif diff --git a/arch/riscv/kernel/soc.c b/arch/riscv/kernel/soc.c index c7b0a73e382e..a0516172a33c 100644 --- a/arch/riscv/kernel/soc.c +++ b/arch/riscv/kernel/soc.c @@ -26,30 +26,3 @@ void __init soc_early_init(void) } } } - -static bool soc_builtin_dtb_match(unsigned long vendor_id, - unsigned long arch_id, unsigned long imp_id, - const struct soc_builtin_dtb *entry) -{ - return entry->vendor_id == vendor_id && - entry->arch_id == arch_id && - entry->imp_id == imp_id; -} - -void * __init soc_lookup_builtin_dtb(void) -{ - unsigned long vendor_id, arch_id, imp_id; - const struct soc_builtin_dtb *s; - - __asm__ ("csrr %0, mvendorid" : "=r"(vendor_id)); - __asm__ ("csrr %0, marchid" : "=r"(arch_id)); - __asm__ ("csrr %0, mimpid" : "=r"(imp_id)); - - for (s = (void *)&__soc_builtin_dtb_table_start; - (void *)s < (void *)&__soc_builtin_dtb_table_end; s++) { - if (soc_builtin_dtb_match(vendor_id, arch_id, imp_id, s)) - return s->dtb_func(); - } - - return NULL; -} diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index bf5379135e39..77bd23f47a72 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -605,11 +605,7 @@ static void __init setup_vm_final(void) asmlinkage void __init setup_vm(uintptr_t dtb_pa) { #ifdef CONFIG_BUILTIN_DTB - dtb_early_va = soc_lookup_builtin_dtb(); - if (!dtb_early_va) { - /* Fallback to first available DTS */ - dtb_early_va = (void *) __dtb_start; - } + dtb_early_va = (void *) __dtb_start; #else dtb_early_va = (void *)dtb_pa; #endif diff --git a/drivers/soc/kendryte/k210-sysctl.c b/drivers/soc/kendryte/k210-sysctl.c index 707019223dd8..4608fbca20e1 100644 --- a/drivers/soc/kendryte/k210-sysctl.c +++ b/drivers/soc/kendryte/k210-sysctl.c @@ -246,15 +246,3 @@ static void __init k210_soc_early_init(const void *fdt) iounmap(regs); } SOC_EARLY_INIT_DECLARE(generic_k210, "kendryte,k210", k210_soc_early_init); - -#ifdef CONFIG_SOC_KENDRYTE_K210_DTB_BUILTIN -/* - * Generic entry for the default k210.dtb embedded DTB for boards with: - * - Vendor ID: 0x4B5 - * - Arch ID: 0xE59889E6A5A04149 (= "Canaan AI" in UTF-8 encoded Chinese) - * - Impl ID: 0x4D41495832303030 (= "MAIX2000") - * These values are reported by the SiPEED MAXDUINO, SiPEED MAIX GO and - * SiPEED Dan dock boards. - */ -SOC_BUILTIN_DTB_DECLARE(k210, 0x4B5, 0xE59889E6A5A04149, 0x4D41495832303030); -#endif From 08734e0581a54df77f1af354b93b02ac581e4fbb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:38 +0900 Subject: [PATCH 016/506] riscv: Use vendor name for K210 SoC support Rename configuration options and directories related to the Kendryte K210 SoC to use the SoC vendor name (canaan) instead of the "kendryte" branding name. Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.socs | 24 +++++++++---------- arch/riscv/Makefile | 2 +- arch/riscv/boot/dts/Makefile | 2 +- arch/riscv/boot/dts/canaan/Makefile | 5 ++++ .../boot/dts/{kendryte => canaan}/k210.dts | 0 .../boot/dts/{kendryte => canaan}/k210.dtsi | 0 arch/riscv/boot/dts/kendryte/Makefile | 5 ---- arch/riscv/configs/nommu_k210_defconfig | 2 +- drivers/soc/Kconfig | 2 +- drivers/soc/Makefile | 2 +- drivers/soc/{kendryte => canaan}/Kconfig | 4 ++-- drivers/soc/{kendryte => canaan}/Makefile | 0 .../soc/{kendryte => canaan}/k210-sysctl.c | 0 13 files changed, 24 insertions(+), 24 deletions(-) create mode 100644 arch/riscv/boot/dts/canaan/Makefile rename arch/riscv/boot/dts/{kendryte => canaan}/k210.dts (100%) rename arch/riscv/boot/dts/{kendryte => canaan}/k210.dtsi (100%) delete mode 100644 arch/riscv/boot/dts/kendryte/Makefile rename drivers/soc/{kendryte => canaan}/Kconfig (79%) rename drivers/soc/{kendryte => canaan}/Makefile (100%) rename drivers/soc/{kendryte => canaan}/k210-sysctl.c (100%) diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 59000675cb9f..027042e4d0af 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -22,21 +22,21 @@ config SOC_VIRT help This enables support for QEMU Virt Machine. -config SOC_KENDRYTE - bool "Kendryte K210 SoC" +config SOC_CANAAN + bool "Canaan Kendryte K210 SoC" depends on !MMU select CLINT_TIMER if RISCV_M_MODE select SERIAL_SIFIVE if TTY select SERIAL_SIFIVE_CONSOLE if TTY select SIFIVE_PLIC help - This enables support for Kendryte K210 SoC platform hardware. + This enables support for Canaan Kendryte K210 SoC platform hardware. -if SOC_KENDRYTE +if SOC_CANAAN -config SOC_KENDRYTE_K210_DTB_BUILTIN - bool "Builtin device tree for the Kendryte K210" - depends on SOC_KENDRYTE +config SOC_CANAAN_K210_DTB_BUILTIN + bool "Builtin device tree for the Canaan Kendryte K210" + depends on SOC_CANAAN default y select OF select BUILTIN_DTB @@ -45,13 +45,13 @@ config SOC_KENDRYTE_K210_DTB_BUILTIN This option should be selected if no bootloader is being used. If unsure, say Y. -config SOC_KENDRYTE_K210_DTB_SOURCE - string "Source file for the Kendryte K210 builtin DTB" - depends on SOC_KENDRYTE - depends on SOC_KENDRYTE_K210_DTB_BUILTIN +config SOC_CANAAN_K210_DTB_SOURCE + string "Source file for the Canaan Kendryte K210 builtin DTB" + depends on SOC_CANAAN + depends on SOC_CANAAN_K210_DTB_BUILTIN default "k210" help - Base name (without suffix, relative to arch/riscv/boot/dts/kendryte) + Base name (without suffix, relative to arch/riscv/boot/dts/canaan) for the DTS file that will be used to produce the DTB linked into the kernel. diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 8c29e553ef7f..597b58b2d4c0 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -83,7 +83,7 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ -ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_KENDRYTE),yy) +ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN),yy) KBUILD_IMAGE := $(boot)/loader.bin else KBUILD_IMAGE := $(boot)/Image.gz diff --git a/arch/riscv/boot/dts/Makefile b/arch/riscv/boot/dts/Makefile index 21e3905f1c44..7ffd502e3e7b 100644 --- a/arch/riscv/boot/dts/Makefile +++ b/arch/riscv/boot/dts/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 subdir-y += sifive -subdir-$(CONFIG_SOC_KENDRYTE) += kendryte +subdir-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += canaan obj-$(CONFIG_BUILTIN_DTB) := $(addsuffix /, $(subdir-y)) diff --git a/arch/riscv/boot/dts/canaan/Makefile b/arch/riscv/boot/dts/canaan/Makefile new file mode 100644 index 000000000000..9ee7156c0c31 --- /dev/null +++ b/arch/riscv/boot/dts/canaan/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +ifneq ($(CONFIG_SOC_CANAAN_K210_DTB_SOURCE),"") +dtb-y += $(strip $(shell echo $(CONFIG_SOC_CANAAN_K210_DTB_SOURCE))).dtb +obj-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += $(addsuffix .o, $(dtb-y)) +endif diff --git a/arch/riscv/boot/dts/kendryte/k210.dts b/arch/riscv/boot/dts/canaan/k210.dts similarity index 100% rename from arch/riscv/boot/dts/kendryte/k210.dts rename to arch/riscv/boot/dts/canaan/k210.dts diff --git a/arch/riscv/boot/dts/kendryte/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi similarity index 100% rename from arch/riscv/boot/dts/kendryte/k210.dtsi rename to arch/riscv/boot/dts/canaan/k210.dtsi diff --git a/arch/riscv/boot/dts/kendryte/Makefile b/arch/riscv/boot/dts/kendryte/Makefile deleted file mode 100644 index 83636693166d..000000000000 --- a/arch/riscv/boot/dts/kendryte/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -ifneq ($(CONFIG_SOC_KENDRYTE_K210_DTB_SOURCE),"") -dtb-y += $(strip $(shell echo $(CONFIG_SOC_KENDRYTE_K210_DTB_SOURCE))).dtb -obj-$(CONFIG_SOC_KENDRYTE_K210_DTB_BUILTIN) += $(addsuffix .o, $(dtb-y)) -endif diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index cd1df62b13c7..368a28cf1467 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -27,7 +27,7 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y # CONFIG_SLAB_MERGE_DEFAULT is not set # CONFIG_MMU is not set -CONFIG_SOC_KENDRYTE=y +CONFIG_SOC_CANAAN=y CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig index d097d070f579..c0143651f4a9 100644 --- a/drivers/soc/Kconfig +++ b/drivers/soc/Kconfig @@ -6,6 +6,7 @@ source "drivers/soc/amlogic/Kconfig" source "drivers/soc/aspeed/Kconfig" source "drivers/soc/atmel/Kconfig" source "drivers/soc/bcm/Kconfig" +source "drivers/soc/canaan/Kconfig" source "drivers/soc/fsl/Kconfig" source "drivers/soc/imx/Kconfig" source "drivers/soc/ixp4xx/Kconfig" @@ -23,6 +24,5 @@ source "drivers/soc/ux500/Kconfig" source "drivers/soc/versatile/Kconfig" source "drivers/soc/xilinx/Kconfig" source "drivers/soc/zte/Kconfig" -source "drivers/soc/kendryte/Kconfig" endmenu diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile index 699b758d28e4..fa7071246546 100644 --- a/drivers/soc/Makefile +++ b/drivers/soc/Makefile @@ -29,4 +29,4 @@ obj-$(CONFIG_ARCH_U8500) += ux500/ obj-$(CONFIG_PLAT_VERSATILE) += versatile/ obj-y += xilinx/ obj-$(CONFIG_ARCH_ZX) += zte/ -obj-$(CONFIG_SOC_KENDRYTE) += kendryte/ +obj-$(CONFIG_SOC_CANAAN) += canaan/ diff --git a/drivers/soc/kendryte/Kconfig b/drivers/soc/canaan/Kconfig similarity index 79% rename from drivers/soc/kendryte/Kconfig rename to drivers/soc/canaan/Kconfig index 49785b1b0217..5232d13f07e5 100644 --- a/drivers/soc/kendryte/Kconfig +++ b/drivers/soc/canaan/Kconfig @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -if SOC_KENDRYTE +if SOC_CANAAN config K210_SYSCTL - bool "Kendryte K210 system controller" + bool "Canaan Kendryte K210 SoC system controller" default y depends on RISCV help diff --git a/drivers/soc/kendryte/Makefile b/drivers/soc/canaan/Makefile similarity index 100% rename from drivers/soc/kendryte/Makefile rename to drivers/soc/canaan/Makefile diff --git a/drivers/soc/kendryte/k210-sysctl.c b/drivers/soc/canaan/k210-sysctl.c similarity index 100% rename from drivers/soc/kendryte/k210-sysctl.c rename to drivers/soc/canaan/k210-sysctl.c From 93c2ce1ee77e2db5d99f00bb7f79c34ab8439f17 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:39 +0900 Subject: [PATCH 017/506] riscv: Fix Canaan Kendryte K210 device tree Remove the clocks property from the cpu and clint nodes as these are ignored. Also remove the clock-frequency property from the cpu nodes as riscv relies on the timebase-frequency property. Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/canaan/k210.dtsi | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index d2d0ff645632..354b263195a3 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -38,8 +38,6 @@ cpu0: cpu@0 { i-cache-block-size = <64>; d-cache-size = <0x8000>; d-cache-block-size = <64>; - clocks = <&sysctl K210_CLK_CPU>; - clock-frequency = <390000000>; cpu0_intc: interrupt-controller { #interrupt-cells = <1>; interrupt-controller; @@ -56,8 +54,6 @@ cpu1: cpu@1 { i-cache-block-size = <64>; d-cache-size = <0x8000>; d-cache-block-size = <64>; - clocks = <&sysctl K210_CLK_CPU>; - clock-frequency = <390000000>; cpu1_intc: interrupt-controller { #interrupt-cells = <1>; interrupt-controller; @@ -101,7 +97,6 @@ clint0: clint@2000000 { reg = <0x2000000 0xC000>; interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 &cpu1_intc 3 &cpu1_intc 7>; - clocks = <&sysctl K210_CLK_ACLK>; }; plic0: interrupt-controller@c000000 { From 802fee26d8afd073c630a74dbe1a996970f3fd90 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:40 +0900 Subject: [PATCH 018/506] riscv: cleanup Canaan Kendryte K210 sysctl driver Introduce the header file include/soc/canaan/k210-sysctl.h to have a common definition of the Canaan Kendryte K210 SoC system controller registers. Simplify the k210 system controller driver code by removing unused register bits definition. The MAINTAINERS file is updated, adding the entry "CANAAN/KENDRYTE K210 SOC SYSTEM CONTROLLER DRIVER" with myself listed as maintainer for this driver. This is a preparatory patch for introducing the K210 clock driver. No functional changes are introduced. Signed-off-by: Damien Le Moal Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 7 +++ drivers/soc/canaan/k210-sysctl.c | 93 +++++++++----------------------- include/soc/canaan/k210-sysctl.h | 41 ++++++++++++++ 3 files changed, 74 insertions(+), 67 deletions(-) create mode 100644 include/soc/canaan/k210-sysctl.h diff --git a/MAINTAINERS b/MAINTAINERS index 6eff4f720c72..4aaa7a17016b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3860,6 +3860,13 @@ W: https://github.com/Cascoda/ca8210-linux.git F: Documentation/devicetree/bindings/net/ieee802154/ca8210.txt F: drivers/net/ieee802154/ca8210.c +CANAAN/KENDRYTE K210 SOC SYSTEM CONTROLLER DRIVER +M: Damien Le Moal +L: linux-riscv@lists.infradead.org +S: Maintained +F: drivers/soc/canaan/ +F: include/soc/canaan/ + CACHEFILES: FS-CACHE BACKEND FOR CACHING ON MOUNTED FILESYSTEMS M: David Howells L: linux-cachefs@redhat.com (moderated for non-subscribers) diff --git a/drivers/soc/canaan/k210-sysctl.c b/drivers/soc/canaan/k210-sysctl.c index 4608fbca20e1..60b474c33d45 100644 --- a/drivers/soc/canaan/k210-sysctl.c +++ b/drivers/soc/canaan/k210-sysctl.c @@ -12,74 +12,33 @@ #include #include +#include + #define K210_SYSCTL_CLK0_FREQ 26000000UL /* Registers base address */ #define K210_SYSCTL_SYSCTL_BASE_ADDR 0x50440000ULL -/* Registers */ -#define K210_SYSCTL_PLL0 0x08 -#define K210_SYSCTL_PLL1 0x0c -/* clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */ -#define PLL_RESET (1 << 20) -#define PLL_PWR (1 << 21) -#define PLL_INTFB (1 << 22) -#define PLL_BYPASS (1 << 23) -#define PLL_TEST (1 << 24) -#define PLL_OUT_EN (1 << 25) -#define PLL_TEST_EN (1 << 26) -#define K210_SYSCTL_PLL_LOCK 0x18 -#define PLL0_LOCK1 (1 << 0) -#define PLL0_LOCK2 (1 << 1) -#define PLL0_SLIP_CLEAR (1 << 2) -#define PLL0_TEST_CLK_OUT (1 << 3) -#define PLL1_LOCK1 (1 << 8) -#define PLL1_LOCK2 (1 << 9) -#define PLL1_SLIP_CLEAR (1 << 10) -#define PLL1_TEST_CLK_OUT (1 << 11) -#define PLL2_LOCK1 (1 << 16) -#define PLL2_LOCK2 (1 << 16) -#define PLL2_SLIP_CLEAR (1 << 18) -#define PLL2_TEST_CLK_OUT (1 << 19) -#define K210_SYSCTL_CLKSEL0 0x20 -#define CLKSEL_ACLK (1 << 0) -#define K210_SYSCTL_CLKEN_CENT 0x28 -#define CLKEN_CPU (1 << 0) -#define CLKEN_SRAM0 (1 << 1) -#define CLKEN_SRAM1 (1 << 2) -#define CLKEN_APB0 (1 << 3) -#define CLKEN_APB1 (1 << 4) -#define CLKEN_APB2 (1 << 5) -#define K210_SYSCTL_CLKEN_PERI 0x2c -#define CLKEN_ROM (1 << 0) -#define CLKEN_DMA (1 << 1) -#define CLKEN_AI (1 << 2) -#define CLKEN_DVP (1 << 3) -#define CLKEN_FFT (1 << 4) -#define CLKEN_GPIO (1 << 5) -#define CLKEN_SPI0 (1 << 6) -#define CLKEN_SPI1 (1 << 7) -#define CLKEN_SPI2 (1 << 8) -#define CLKEN_SPI3 (1 << 9) -#define CLKEN_I2S0 (1 << 10) -#define CLKEN_I2S1 (1 << 11) -#define CLKEN_I2S2 (1 << 12) -#define CLKEN_I2C0 (1 << 13) -#define CLKEN_I2C1 (1 << 14) -#define CLKEN_I2C2 (1 << 15) -#define CLKEN_UART1 (1 << 16) -#define CLKEN_UART2 (1 << 17) -#define CLKEN_UART3 (1 << 18) -#define CLKEN_AES (1 << 19) -#define CLKEN_FPIO (1 << 20) -#define CLKEN_TIMER0 (1 << 21) -#define CLKEN_TIMER1 (1 << 22) -#define CLKEN_TIMER2 (1 << 23) -#define CLKEN_WDT0 (1 << 24) -#define CLKEN_WDT1 (1 << 25) -#define CLKEN_SHA (1 << 26) -#define CLKEN_OTP (1 << 27) -#define CLKEN_RTC (1 << 29) +/* Register bits */ +/* K210_SYSCTL_PLL1: clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */ +#define PLL_RESET (1 << 20) +#define PLL_PWR (1 << 21) +#define PLL_BYPASS (1 << 23) +#define PLL_OUT_EN (1 << 25) +/* K210_SYSCTL_PLL_LOCK */ +#define PLL1_LOCK1 (1 << 8) +#define PLL1_LOCK2 (1 << 9) +#define PLL1_SLIP_CLEAR (1 << 10) +/* K210_SYSCTL_SEL0 */ +#define CLKSEL_ACLK (1 << 0) +/* K210_SYSCTL_CLKEN_CENT */ +#define CLKEN_CPU (1 << 0) +#define CLKEN_SRAM0 (1 << 1) +#define CLKEN_SRAM1 (1 << 2) +/* K210_SYSCTL_EN_PERI */ +#define CLKEN_ROM (1 << 0) +#define CLKEN_TIMER0 (1 << 21) +#define CLKEN_RTC (1 << 29) struct k210_sysctl { void __iomem *regs; @@ -140,7 +99,7 @@ static unsigned long k210_sysctl_clk_recalc_rate(struct clk_hw *hw, * If the clock selector is not set, use the base frequency. * Otherwise, use PLL0 frequency with a frequency divisor. */ - clksel0 = readl(s->regs + K210_SYSCTL_CLKSEL0); + clksel0 = readl(s->regs + K210_SYSCTL_SEL0); if (!(clksel0 & CLKSEL_ACLK)) return K210_SYSCTL_CLK0_FREQ; @@ -237,11 +196,11 @@ static void __init k210_soc_early_init(const void *fdt) k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL0); k210_set_bits(CLKEN_CPU | CLKEN_SRAM0 | CLKEN_SRAM1, - regs + K210_SYSCTL_CLKEN_CENT); + regs + K210_SYSCTL_EN_CENT); k210_set_bits(CLKEN_ROM | CLKEN_TIMER0 | CLKEN_RTC, - regs + K210_SYSCTL_CLKEN_PERI); + regs + K210_SYSCTL_EN_PERI); - k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_CLKSEL0); + k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_SEL0); iounmap(regs); } diff --git a/include/soc/canaan/k210-sysctl.h b/include/soc/canaan/k210-sysctl.h new file mode 100644 index 000000000000..2e037db68f35 --- /dev/null +++ b/include/soc/canaan/k210-sysctl.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef K210_SYSCTL_H +#define K210_SYSCTL_H + +/* + * Kendryte K210 SoC system controller registers offsets. + * Taken from Kendryte SDK (kendryte-standalone-sdk). + */ +#define K210_SYSCTL_GIT_ID 0x00 /* Git short commit id */ +#define K210_SYSCTL_UART_BAUD 0x04 /* Default UARTHS baud rate */ +#define K210_SYSCTL_PLL0 0x08 /* PLL0 controller */ +#define K210_SYSCTL_PLL1 0x0C /* PLL1 controller */ +#define K210_SYSCTL_PLL2 0x10 /* PLL2 controller */ +#define K210_SYSCTL_PLL_LOCK 0x18 /* PLL lock tester */ +#define K210_SYSCTL_ROM_ERROR 0x1C /* AXI ROM detector */ +#define K210_SYSCTL_SEL0 0x20 /* Clock select controller 0 */ +#define K210_SYSCTL_SEL1 0x24 /* Clock select controller 1 */ +#define K210_SYSCTL_EN_CENT 0x28 /* Central clock enable */ +#define K210_SYSCTL_EN_PERI 0x2C /* Peripheral clock enable */ +#define K210_SYSCTL_SOFT_RESET 0x30 /* Soft reset ctrl */ +#define K210_SYSCTL_PERI_RESET 0x34 /* Peripheral reset controller */ +#define K210_SYSCTL_THR0 0x38 /* Clock threshold controller 0 */ +#define K210_SYSCTL_THR1 0x3C /* Clock threshold controller 1 */ +#define K210_SYSCTL_THR2 0x40 /* Clock threshold controller 2 */ +#define K210_SYSCTL_THR3 0x44 /* Clock threshold controller 3 */ +#define K210_SYSCTL_THR4 0x48 /* Clock threshold controller 4 */ +#define K210_SYSCTL_THR5 0x4C /* Clock threshold controller 5 */ +#define K210_SYSCTL_THR6 0x50 /* Clock threshold controller 6 */ +#define K210_SYSCTL_MISC 0x54 /* Miscellaneous controller */ +#define K210_SYSCTL_PERI 0x58 /* Peripheral controller */ +#define K210_SYSCTL_SPI_SLEEP 0x5C /* SPI sleep controller */ +#define K210_SYSCTL_RESET_STAT 0x60 /* Reset source status */ +#define K210_SYSCTL_DMA_SEL0 0x64 /* DMA handshake selector 0 */ +#define K210_SYSCTL_DMA_SEL1 0x68 /* DMA handshake selector 1 */ +#define K210_SYSCTL_POWER_SEL 0x6C /* IO Power Mode Select controller */ + +#endif From 23fb08e72a0e3cb0509337e3322a7b57d73fbb82 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:45 +0900 Subject: [PATCH 019/506] dt-binding: mfd: Document canaan,k210-sysctl bindings Document the device tree bindings of the Canaan Kendryte K210 SoC system controller driver in Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml. Signed-off-by: Damien Le Moal Reviewed-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../bindings/mfd/canaan,k210-sysctl.yaml | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml diff --git a/Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml b/Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml new file mode 100644 index 000000000000..c24ad45cabb5 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/canaan,k210-sysctl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Canaan Kendryte K210 System Controller Device Tree Bindings + +maintainers: + - Damien Le Moal + +description: + Canaan Inc. Kendryte K210 SoC system controller which provides a + register map for controlling the clocks, reset signals and pin power + domains of the SoC. + +properties: + compatible: + items: + - const: canaan,k210-sysctl + - const: syscon + - const: simple-mfd + + clocks: + maxItems: 1 + description: + System controller Advanced Power Bus (APB) interface clock source. + + clock-names: + items: + - const: pclk + + reg: + maxItems: 1 + + clock-controller: + # Child node + type: object + $ref: "../clock/canaan,k210-clk.yaml" + description: + Clock controller for the SoC clocks. This child node definition + should follow the bindings specified in + Documentation/devicetree/bindings/clock/canaan,k210-clk.yaml. + + reset-controller: + # Child node + type: object + $ref: "../reset/canaan,k210-rst.yaml" + description: + Reset controller for the SoC. This child node definition + should follow the bindings specified in + Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml. + + syscon-reboot: + # Child node + type: object + $ref: "../power/reset/syscon-reboot.yaml" + description: + Reboot method for the SoC. This child node definition + should follow the bindings specified in + Documentation/devicetree/bindings/power/reset/syscon-reboot.yaml. + +required: + - compatible + - clocks + - reg + - clock-controller + +additionalProperties: false + +examples: + - | + #include + #include + + clocks { + in0: oscllator { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <26000000>; + }; + }; + + sysctl: syscon@50440000 { + compatible = "canaan,k210-sysctl", + "syscon", "simple-mfd"; + reg = <0x50440000 0x100>; + clocks = <&sysclk K210_CLK_APB1>; + clock-names = "pclk"; + + sysclk: clock-controller { + #clock-cells = <1>; + compatible = "canaan,k210-clk"; + clocks = <&in0>; + }; + + sysrst: reset-controller { + compatible = "canaan,k210-rst"; + #reset-cells = <1>; + }; + + reboot: syscon-reboot { + compatible = "syscon-reboot"; + regmap = <&sysctl>; + offset = <48>; + mask = <1>; + value = <1>; + }; + }; From 1d7c9d093ed58b8cf4ae23986cd01272667d412a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:43 +0900 Subject: [PATCH 020/506] dt-bindings: reset: Document canaan,k210-rst bindings Document the device tree bindings for the Canaan Kendryte K210 SoC reset controller driver in Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml. The header file include/dt-bindings/reset/k210-rst.h is added to define all possible reset lines of the SoC. Signed-off-by: Damien Le Moal Reviewed-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../bindings/reset/canaan,k210-rst.yaml | 40 ++++++++++++++++++ include/dt-bindings/reset/k210-rst.h | 42 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml create mode 100644 include/dt-bindings/reset/k210-rst.h diff --git a/Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml b/Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml new file mode 100644 index 000000000000..53e4ede9c0bd --- /dev/null +++ b/Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/reset/canaan,k210-rst.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Canaan Kendryte K210 Reset Controller Device Tree Bindings + +maintainers: + - Damien Le Moal + +description: | + Canaan Kendryte K210 reset controller driver which supports the SoC + system controller supplied reset registers for the various peripherals + of the SoC. The K210 reset controller node must be defined as a child + node of the K210 system controller node. + + See also: + - dt-bindings/reset/k210-rst.h + +properties: + compatible: + const: canaan,k210-rst + + '#reset-cells': + const: 1 + +required: + - '#reset-cells' + - compatible + +additionalProperties: false + +examples: + - | + #include + sysrst: reset-controller { + compatible = "canaan,k210-rst"; + #reset-cells = <1>; + }; diff --git a/include/dt-bindings/reset/k210-rst.h b/include/dt-bindings/reset/k210-rst.h new file mode 100644 index 000000000000..883c1aed50e8 --- /dev/null +++ b/include/dt-bindings/reset/k210-rst.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (C) 2019 Sean Anderson + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef RESET_K210_SYSCTL_H +#define RESET_K210_SYSCTL_H + +/* + * Kendryte K210 SoC system controller K210_SYSCTL_SOFT_RESET register bits. + * Taken from Kendryte SDK (kendryte-standalone-sdk). + */ +#define K210_RST_ROM 0 +#define K210_RST_DMA 1 +#define K210_RST_AI 2 +#define K210_RST_DVP 3 +#define K210_RST_FFT 4 +#define K210_RST_GPIO 5 +#define K210_RST_SPI0 6 +#define K210_RST_SPI1 7 +#define K210_RST_SPI2 8 +#define K210_RST_SPI3 9 +#define K210_RST_I2S0 10 +#define K210_RST_I2S1 11 +#define K210_RST_I2S2 12 +#define K210_RST_I2C0 13 +#define K210_RST_I2C1 14 +#define K210_RST_I2C2 15 +#define K210_RST_UART1 16 +#define K210_RST_UART2 17 +#define K210_RST_UART3 18 +#define K210_RST_AES 19 +#define K210_RST_FPIOA 20 +#define K210_RST_TIMER0 21 +#define K210_RST_TIMER1 22 +#define K210_RST_TIMER2 23 +#define K210_RST_WDT0 24 +#define K210_RST_WDT1 25 +#define K210_RST_SHA 26 +#define K210_RST_RTC 29 + +#endif /* RESET_K210_SYSCTL_H */ From ed3137edb31b86702511e7ad12b4abe8686b6805 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:44 +0900 Subject: [PATCH 021/506] dt-bindings: pinctrl: Document canaan,k210-fpioa bindings Document the device tree bindings for the Canaan Kendryte K210 SoC Fully Programmable IO Array (FPIOA) pinctrl driver in Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml. The new header file include/dt-bindings/pinctrl/k210-fpioa.h is added to define all 256 possible pin functions of the SoC IO pins, as well as macros simplifying the definition of pin functions in a device tree. Signed-off-by: Damien Le Moal Reviewed-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../bindings/pinctrl/canaan,k210-fpioa.yaml | 171 +++++++++++ include/dt-bindings/pinctrl/k210-fpioa.h | 276 ++++++++++++++++++ 2 files changed, 447 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml create mode 100644 include/dt-bindings/pinctrl/k210-fpioa.h diff --git a/Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml b/Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml new file mode 100644 index 000000000000..46fbc73ab26b --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/canaan,k210-fpioa.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Canaan Kendryte K210 FPIOA Device Tree Bindings + +maintainers: + - Damien Le Moal + +description: + The Canaan Kendryte K210 SoC Fully Programmable IO Array (FPIOA) + controller allows assiging any of 256 possible functions to any of + 48 IO pins of the SoC. Pin function configuration is performed on + a per-pin basis. + +properties: + compatible: + const: canaan,k210-fpioa + + reg: + maxItems: 1 + description: + Address and length of the register set for the FPIOA controller. + + clocks: + items: + - description: Controller reference clock source + - description: APB interface clock source + + clock-names: + items: + - const: ref + - const: pclk + + resets: + maxItems: 1 + + canaan,k210-sysctl-power: + $ref: /schemas/types.yaml#/definitions/phandle-array + description: | + phandle of the K210 system controller node and offset of its + power domain control register. + +patternProperties: + '-pinmux$': + type: object + $ref: /schemas/pinctrl/pinmux-node.yaml + description: + FPIOA client devices use sub-nodes to define the desired pin + configuration. Client device sub-nodes use the pinux property + below. + + properties: + pinmux: + description: + List of IO pins alternate functions. The values for each IO + pin is a combination of an IO pin number (0 to 47) with the + desired function for the IO pin. Functions are defined as + macros in include/dt-bindings/pinctrl/k210-fpioa.h. + The K210_FPIOA(IO pin, function) macro is provided to + facilitate the combination of IO pin numbers and functions. + + required: + - pinmux + + additionalProperties: false + + '-pins$': + type: object + $ref: /schemas/pinctrl/pincfg-node.yaml + description: + FPIOA client devices use sub-nodes to define the desired + configuration of pins. Client device sub-nodes use the + properties below. + + properties: + pins: + description: + List of IO pins affected by the properties specified in this + subnode. IO pins are identified using the pin names "IO_xx". + Pin configuration nodes can also define the power domain to + be used for the SoC pin groups A0 (IO pins 0-5), + A1 (IO pins 6-11), A2 (IO pins 12-17), B0 (IO pins 18-23), + B1 (IO pins 24-29), B2 (IO pins 30-35), B3 (IO pins 30-35), + C0 (IO pins 36-41) and C1 (IO pins 42-47) using the + power-source property. + items: + anyOf: + - pattern: "^(IO_([0-9]*))|(A[0-2])|(B[3-5])|(C[6-7])$" + - enum: [ IO_0, IO_1, IO_2, IO_3, IO_4, IO_5, IO_6, IO_7, + IO_8, IO_9, IO_10, IO_11, IO_12, IO_13, IO_14, + IO_15, IO_16, IO_17, IO_18, IO_19, IO_20, IO_21, + IO_22, IO_23, IO_24, IO_25, IO_26, IO_27, IO_28, + IO_29, IO_30, IO_31, IO_32, IO_33, IO_34, IO_35, + IO_36, IO_37, IO_38, IO_39, IO_40, IO_41, IO_42, + IO_43, IO_44, IO_45, IO_46, IO_47, + A0, A1, A2, B3, B4, B5, C6, C7 ] + bias-disable: true + + bias-pull-down: true + + bias-pull-up: true + + drive-strength: true + + drive-strength-microamp: true + + input-enable: true + + input-disable: true + + input-schmitt-enable: true + + input-schmitt-disable: true + + input-polarity-invert: + description: + Enable or disable pin input polarity inversion. + + output-enable: true + + output-disable: true + + output-high: true + + output-low: true + + output-polarity-invert: + description: + Enable or disable pin output polarity inversion. + + slew-rate: true + + power-source: true + + additionalProperties: false + +required: + - compatible + - reg + - clocks + - canaan,k210-sysctl-power + +additionalProperties: false + +examples: + - | + #include + #include + #include + + fpioa: pinmux@502B0000 { + compatible = "canaan,k210-fpioa"; + reg = <0x502B0000 0x100>; + clocks = <&sysclk K210_CLK_FPIOA>, + <&sysclk K210_CLK_APB0>; + clock-names = "ref", "pclk"; + resets = <&sysrst K210_RST_FPIOA>; + canaan,k210-sysctl-power = <&sysctl 108>; + pinctrl-0 = <&jtag_pinctrl>; + pinctrl-names = "default"; + + jtag_pinctrl: jtag-pinmux { + pinmux = , + , + , + ; + }; + }; diff --git a/include/dt-bindings/pinctrl/k210-fpioa.h b/include/dt-bindings/pinctrl/k210-fpioa.h new file mode 100644 index 000000000000..314285eab3a1 --- /dev/null +++ b/include/dt-bindings/pinctrl/k210-fpioa.h @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (C) 2020 Sean Anderson + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef PINCTRL_K210_FPIOA_H +#define PINCTRL_K210_FPIOA_H + +/* + * Full list of FPIOA functions from + * kendryte-standalone-sdk/lib/drivers/include/fpioa.h + */ +#define K210_PCF_MASK GENMASK(7, 0) +#define K210_PCF_JTAG_TCLK 0 /* JTAG Test Clock */ +#define K210_PCF_JTAG_TDI 1 /* JTAG Test Data In */ +#define K210_PCF_JTAG_TMS 2 /* JTAG Test Mode Select */ +#define K210_PCF_JTAG_TDO 3 /* JTAG Test Data Out */ +#define K210_PCF_SPI0_D0 4 /* SPI0 Data 0 */ +#define K210_PCF_SPI0_D1 5 /* SPI0 Data 1 */ +#define K210_PCF_SPI0_D2 6 /* SPI0 Data 2 */ +#define K210_PCF_SPI0_D3 7 /* SPI0 Data 3 */ +#define K210_PCF_SPI0_D4 8 /* SPI0 Data 4 */ +#define K210_PCF_SPI0_D5 9 /* SPI0 Data 5 */ +#define K210_PCF_SPI0_D6 10 /* SPI0 Data 6 */ +#define K210_PCF_SPI0_D7 11 /* SPI0 Data 7 */ +#define K210_PCF_SPI0_SS0 12 /* SPI0 Chip Select 0 */ +#define K210_PCF_SPI0_SS1 13 /* SPI0 Chip Select 1 */ +#define K210_PCF_SPI0_SS2 14 /* SPI0 Chip Select 2 */ +#define K210_PCF_SPI0_SS3 15 /* SPI0 Chip Select 3 */ +#define K210_PCF_SPI0_ARB 16 /* SPI0 Arbitration */ +#define K210_PCF_SPI0_SCLK 17 /* SPI0 Serial Clock */ +#define K210_PCF_UARTHS_RX 18 /* UART High speed Receiver */ +#define K210_PCF_UARTHS_TX 19 /* UART High speed Transmitter */ +#define K210_PCF_RESV6 20 /* Reserved function */ +#define K210_PCF_RESV7 21 /* Reserved function */ +#define K210_PCF_CLK_SPI1 22 /* Clock SPI1 */ +#define K210_PCF_CLK_I2C1 23 /* Clock I2C1 */ +#define K210_PCF_GPIOHS0 24 /* GPIO High speed 0 */ +#define K210_PCF_GPIOHS1 25 /* GPIO High speed 1 */ +#define K210_PCF_GPIOHS2 26 /* GPIO High speed 2 */ +#define K210_PCF_GPIOHS3 27 /* GPIO High speed 3 */ +#define K210_PCF_GPIOHS4 28 /* GPIO High speed 4 */ +#define K210_PCF_GPIOHS5 29 /* GPIO High speed 5 */ +#define K210_PCF_GPIOHS6 30 /* GPIO High speed 6 */ +#define K210_PCF_GPIOHS7 31 /* GPIO High speed 7 */ +#define K210_PCF_GPIOHS8 32 /* GPIO High speed 8 */ +#define K210_PCF_GPIOHS9 33 /* GPIO High speed 9 */ +#define K210_PCF_GPIOHS10 34 /* GPIO High speed 10 */ +#define K210_PCF_GPIOHS11 35 /* GPIO High speed 11 */ +#define K210_PCF_GPIOHS12 36 /* GPIO High speed 12 */ +#define K210_PCF_GPIOHS13 37 /* GPIO High speed 13 */ +#define K210_PCF_GPIOHS14 38 /* GPIO High speed 14 */ +#define K210_PCF_GPIOHS15 39 /* GPIO High speed 15 */ +#define K210_PCF_GPIOHS16 40 /* GPIO High speed 16 */ +#define K210_PCF_GPIOHS17 41 /* GPIO High speed 17 */ +#define K210_PCF_GPIOHS18 42 /* GPIO High speed 18 */ +#define K210_PCF_GPIOHS19 43 /* GPIO High speed 19 */ +#define K210_PCF_GPIOHS20 44 /* GPIO High speed 20 */ +#define K210_PCF_GPIOHS21 45 /* GPIO High speed 21 */ +#define K210_PCF_GPIOHS22 46 /* GPIO High speed 22 */ +#define K210_PCF_GPIOHS23 47 /* GPIO High speed 23 */ +#define K210_PCF_GPIOHS24 48 /* GPIO High speed 24 */ +#define K210_PCF_GPIOHS25 49 /* GPIO High speed 25 */ +#define K210_PCF_GPIOHS26 50 /* GPIO High speed 26 */ +#define K210_PCF_GPIOHS27 51 /* GPIO High speed 27 */ +#define K210_PCF_GPIOHS28 52 /* GPIO High speed 28 */ +#define K210_PCF_GPIOHS29 53 /* GPIO High speed 29 */ +#define K210_PCF_GPIOHS30 54 /* GPIO High speed 30 */ +#define K210_PCF_GPIOHS31 55 /* GPIO High speed 31 */ +#define K210_PCF_GPIO0 56 /* GPIO pin 0 */ +#define K210_PCF_GPIO1 57 /* GPIO pin 1 */ +#define K210_PCF_GPIO2 58 /* GPIO pin 2 */ +#define K210_PCF_GPIO3 59 /* GPIO pin 3 */ +#define K210_PCF_GPIO4 60 /* GPIO pin 4 */ +#define K210_PCF_GPIO5 61 /* GPIO pin 5 */ +#define K210_PCF_GPIO6 62 /* GPIO pin 6 */ +#define K210_PCF_GPIO7 63 /* GPIO pin 7 */ +#define K210_PCF_UART1_RX 64 /* UART1 Receiver */ +#define K210_PCF_UART1_TX 65 /* UART1 Transmitter */ +#define K210_PCF_UART2_RX 66 /* UART2 Receiver */ +#define K210_PCF_UART2_TX 67 /* UART2 Transmitter */ +#define K210_PCF_UART3_RX 68 /* UART3 Receiver */ +#define K210_PCF_UART3_TX 69 /* UART3 Transmitter */ +#define K210_PCF_SPI1_D0 70 /* SPI1 Data 0 */ +#define K210_PCF_SPI1_D1 71 /* SPI1 Data 1 */ +#define K210_PCF_SPI1_D2 72 /* SPI1 Data 2 */ +#define K210_PCF_SPI1_D3 73 /* SPI1 Data 3 */ +#define K210_PCF_SPI1_D4 74 /* SPI1 Data 4 */ +#define K210_PCF_SPI1_D5 75 /* SPI1 Data 5 */ +#define K210_PCF_SPI1_D6 76 /* SPI1 Data 6 */ +#define K210_PCF_SPI1_D7 77 /* SPI1 Data 7 */ +#define K210_PCF_SPI1_SS0 78 /* SPI1 Chip Select 0 */ +#define K210_PCF_SPI1_SS1 79 /* SPI1 Chip Select 1 */ +#define K210_PCF_SPI1_SS2 80 /* SPI1 Chip Select 2 */ +#define K210_PCF_SPI1_SS3 81 /* SPI1 Chip Select 3 */ +#define K210_PCF_SPI1_ARB 82 /* SPI1 Arbitration */ +#define K210_PCF_SPI1_SCLK 83 /* SPI1 Serial Clock */ +#define K210_PCF_SPI2_D0 84 /* SPI2 Data 0 */ +#define K210_PCF_SPI2_SS 85 /* SPI2 Select */ +#define K210_PCF_SPI2_SCLK 86 /* SPI2 Serial Clock */ +#define K210_PCF_I2S0_MCLK 87 /* I2S0 Master Clock */ +#define K210_PCF_I2S0_SCLK 88 /* I2S0 Serial Clock(BCLK) */ +#define K210_PCF_I2S0_WS 89 /* I2S0 Word Select(LRCLK) */ +#define K210_PCF_I2S0_IN_D0 90 /* I2S0 Serial Data Input 0 */ +#define K210_PCF_I2S0_IN_D1 91 /* I2S0 Serial Data Input 1 */ +#define K210_PCF_I2S0_IN_D2 92 /* I2S0 Serial Data Input 2 */ +#define K210_PCF_I2S0_IN_D3 93 /* I2S0 Serial Data Input 3 */ +#define K210_PCF_I2S0_OUT_D0 94 /* I2S0 Serial Data Output 0 */ +#define K210_PCF_I2S0_OUT_D1 95 /* I2S0 Serial Data Output 1 */ +#define K210_PCF_I2S0_OUT_D2 96 /* I2S0 Serial Data Output 2 */ +#define K210_PCF_I2S0_OUT_D3 97 /* I2S0 Serial Data Output 3 */ +#define K210_PCF_I2S1_MCLK 98 /* I2S1 Master Clock */ +#define K210_PCF_I2S1_SCLK 99 /* I2S1 Serial Clock(BCLK) */ +#define K210_PCF_I2S1_WS 100 /* I2S1 Word Select(LRCLK) */ +#define K210_PCF_I2S1_IN_D0 101 /* I2S1 Serial Data Input 0 */ +#define K210_PCF_I2S1_IN_D1 102 /* I2S1 Serial Data Input 1 */ +#define K210_PCF_I2S1_IN_D2 103 /* I2S1 Serial Data Input 2 */ +#define K210_PCF_I2S1_IN_D3 104 /* I2S1 Serial Data Input 3 */ +#define K210_PCF_I2S1_OUT_D0 105 /* I2S1 Serial Data Output 0 */ +#define K210_PCF_I2S1_OUT_D1 106 /* I2S1 Serial Data Output 1 */ +#define K210_PCF_I2S1_OUT_D2 107 /* I2S1 Serial Data Output 2 */ +#define K210_PCF_I2S1_OUT_D3 108 /* I2S1 Serial Data Output 3 */ +#define K210_PCF_I2S2_MCLK 109 /* I2S2 Master Clock */ +#define K210_PCF_I2S2_SCLK 110 /* I2S2 Serial Clock(BCLK) */ +#define K210_PCF_I2S2_WS 111 /* I2S2 Word Select(LRCLK) */ +#define K210_PCF_I2S2_IN_D0 112 /* I2S2 Serial Data Input 0 */ +#define K210_PCF_I2S2_IN_D1 113 /* I2S2 Serial Data Input 1 */ +#define K210_PCF_I2S2_IN_D2 114 /* I2S2 Serial Data Input 2 */ +#define K210_PCF_I2S2_IN_D3 115 /* I2S2 Serial Data Input 3 */ +#define K210_PCF_I2S2_OUT_D0 116 /* I2S2 Serial Data Output 0 */ +#define K210_PCF_I2S2_OUT_D1 117 /* I2S2 Serial Data Output 1 */ +#define K210_PCF_I2S2_OUT_D2 118 /* I2S2 Serial Data Output 2 */ +#define K210_PCF_I2S2_OUT_D3 119 /* I2S2 Serial Data Output 3 */ +#define K210_PCF_RESV0 120 /* Reserved function */ +#define K210_PCF_RESV1 121 /* Reserved function */ +#define K210_PCF_RESV2 122 /* Reserved function */ +#define K210_PCF_RESV3 123 /* Reserved function */ +#define K210_PCF_RESV4 124 /* Reserved function */ +#define K210_PCF_RESV5 125 /* Reserved function */ +#define K210_PCF_I2C0_SCLK 126 /* I2C0 Serial Clock */ +#define K210_PCF_I2C0_SDA 127 /* I2C0 Serial Data */ +#define K210_PCF_I2C1_SCLK 128 /* I2C1 Serial Clock */ +#define K210_PCF_I2C1_SDA 129 /* I2C1 Serial Data */ +#define K210_PCF_I2C2_SCLK 130 /* I2C2 Serial Clock */ +#define K210_PCF_I2C2_SDA 131 /* I2C2 Serial Data */ +#define K210_PCF_DVP_XCLK 132 /* DVP System Clock */ +#define K210_PCF_DVP_RST 133 /* DVP System Reset */ +#define K210_PCF_DVP_PWDN 134 /* DVP Power Down Mode */ +#define K210_PCF_DVP_VSYNC 135 /* DVP Vertical Sync */ +#define K210_PCF_DVP_HSYNC 136 /* DVP Horizontal Sync */ +#define K210_PCF_DVP_PCLK 137 /* Pixel Clock */ +#define K210_PCF_DVP_D0 138 /* Data Bit 0 */ +#define K210_PCF_DVP_D1 139 /* Data Bit 1 */ +#define K210_PCF_DVP_D2 140 /* Data Bit 2 */ +#define K210_PCF_DVP_D3 141 /* Data Bit 3 */ +#define K210_PCF_DVP_D4 142 /* Data Bit 4 */ +#define K210_PCF_DVP_D5 143 /* Data Bit 5 */ +#define K210_PCF_DVP_D6 144 /* Data Bit 6 */ +#define K210_PCF_DVP_D7 145 /* Data Bit 7 */ +#define K210_PCF_SCCB_SCLK 146 /* Serial Camera Control Bus Clock */ +#define K210_PCF_SCCB_SDA 147 /* Serial Camera Control Bus Data */ +#define K210_PCF_UART1_CTS 148 /* UART1 Clear To Send */ +#define K210_PCF_UART1_DSR 149 /* UART1 Data Set Ready */ +#define K210_PCF_UART1_DCD 150 /* UART1 Data Carrier Detect */ +#define K210_PCF_UART1_RI 151 /* UART1 Ring Indicator */ +#define K210_PCF_UART1_SIR_IN 152 /* UART1 Serial Infrared Input */ +#define K210_PCF_UART1_DTR 153 /* UART1 Data Terminal Ready */ +#define K210_PCF_UART1_RTS 154 /* UART1 Request To Send */ +#define K210_PCF_UART1_OUT2 155 /* UART1 User-designated Output 2 */ +#define K210_PCF_UART1_OUT1 156 /* UART1 User-designated Output 1 */ +#define K210_PCF_UART1_SIR_OUT 157 /* UART1 Serial Infrared Output */ +#define K210_PCF_UART1_BAUD 158 /* UART1 Transmit Clock Output */ +#define K210_PCF_UART1_RE 159 /* UART1 Receiver Output Enable */ +#define K210_PCF_UART1_DE 160 /* UART1 Driver Output Enable */ +#define K210_PCF_UART1_RS485_EN 161 /* UART1 RS485 Enable */ +#define K210_PCF_UART2_CTS 162 /* UART2 Clear To Send */ +#define K210_PCF_UART2_DSR 163 /* UART2 Data Set Ready */ +#define K210_PCF_UART2_DCD 164 /* UART2 Data Carrier Detect */ +#define K210_PCF_UART2_RI 165 /* UART2 Ring Indicator */ +#define K210_PCF_UART2_SIR_IN 166 /* UART2 Serial Infrared Input */ +#define K210_PCF_UART2_DTR 167 /* UART2 Data Terminal Ready */ +#define K210_PCF_UART2_RTS 168 /* UART2 Request To Send */ +#define K210_PCF_UART2_OUT2 169 /* UART2 User-designated Output 2 */ +#define K210_PCF_UART2_OUT1 170 /* UART2 User-designated Output 1 */ +#define K210_PCF_UART2_SIR_OUT 171 /* UART2 Serial Infrared Output */ +#define K210_PCF_UART2_BAUD 172 /* UART2 Transmit Clock Output */ +#define K210_PCF_UART2_RE 173 /* UART2 Receiver Output Enable */ +#define K210_PCF_UART2_DE 174 /* UART2 Driver Output Enable */ +#define K210_PCF_UART2_RS485_EN 175 /* UART2 RS485 Enable */ +#define K210_PCF_UART3_CTS 176 /* UART3 Clear To Send */ +#define K210_PCF_UART3_DSR 177 /* UART3 Data Set Ready */ +#define K210_PCF_UART3_DCD 178 /* UART3 Data Carrier Detect */ +#define K210_PCF_UART3_RI 179 /* UART3 Ring Indicator */ +#define K210_PCF_UART3_SIR_IN 180 /* UART3 Serial Infrared Input */ +#define K210_PCF_UART3_DTR 181 /* UART3 Data Terminal Ready */ +#define K210_PCF_UART3_RTS 182 /* UART3 Request To Send */ +#define K210_PCF_UART3_OUT2 183 /* UART3 User-designated Output 2 */ +#define K210_PCF_UART3_OUT1 184 /* UART3 User-designated Output 1 */ +#define K210_PCF_UART3_SIR_OUT 185 /* UART3 Serial Infrared Output */ +#define K210_PCF_UART3_BAUD 186 /* UART3 Transmit Clock Output */ +#define K210_PCF_UART3_RE 187 /* UART3 Receiver Output Enable */ +#define K210_PCF_UART3_DE 188 /* UART3 Driver Output Enable */ +#define K210_PCF_UART3_RS485_EN 189 /* UART3 RS485 Enable */ +#define K210_PCF_TIMER0_TOGGLE1 190 /* TIMER0 Toggle Output 1 */ +#define K210_PCF_TIMER0_TOGGLE2 191 /* TIMER0 Toggle Output 2 */ +#define K210_PCF_TIMER0_TOGGLE3 192 /* TIMER0 Toggle Output 3 */ +#define K210_PCF_TIMER0_TOGGLE4 193 /* TIMER0 Toggle Output 4 */ +#define K210_PCF_TIMER1_TOGGLE1 194 /* TIMER1 Toggle Output 1 */ +#define K210_PCF_TIMER1_TOGGLE2 195 /* TIMER1 Toggle Output 2 */ +#define K210_PCF_TIMER1_TOGGLE3 196 /* TIMER1 Toggle Output 3 */ +#define K210_PCF_TIMER1_TOGGLE4 197 /* TIMER1 Toggle Output 4 */ +#define K210_PCF_TIMER2_TOGGLE1 198 /* TIMER2 Toggle Output 1 */ +#define K210_PCF_TIMER2_TOGGLE2 199 /* TIMER2 Toggle Output 2 */ +#define K210_PCF_TIMER2_TOGGLE3 200 /* TIMER2 Toggle Output 3 */ +#define K210_PCF_TIMER2_TOGGLE4 201 /* TIMER2 Toggle Output 4 */ +#define K210_PCF_CLK_SPI2 202 /* Clock SPI2 */ +#define K210_PCF_CLK_I2C2 203 /* Clock I2C2 */ +#define K210_PCF_INTERNAL0 204 /* Internal function signal 0 */ +#define K210_PCF_INTERNAL1 205 /* Internal function signal 1 */ +#define K210_PCF_INTERNAL2 206 /* Internal function signal 2 */ +#define K210_PCF_INTERNAL3 207 /* Internal function signal 3 */ +#define K210_PCF_INTERNAL4 208 /* Internal function signal 4 */ +#define K210_PCF_INTERNAL5 209 /* Internal function signal 5 */ +#define K210_PCF_INTERNAL6 210 /* Internal function signal 6 */ +#define K210_PCF_INTERNAL7 211 /* Internal function signal 7 */ +#define K210_PCF_INTERNAL8 212 /* Internal function signal 8 */ +#define K210_PCF_INTERNAL9 213 /* Internal function signal 9 */ +#define K210_PCF_INTERNAL10 214 /* Internal function signal 10 */ +#define K210_PCF_INTERNAL11 215 /* Internal function signal 11 */ +#define K210_PCF_INTERNAL12 216 /* Internal function signal 12 */ +#define K210_PCF_INTERNAL13 217 /* Internal function signal 13 */ +#define K210_PCF_INTERNAL14 218 /* Internal function signal 14 */ +#define K210_PCF_INTERNAL15 219 /* Internal function signal 15 */ +#define K210_PCF_INTERNAL16 220 /* Internal function signal 16 */ +#define K210_PCF_INTERNAL17 221 /* Internal function signal 17 */ +#define K210_PCF_CONSTANT 222 /* Constant function */ +#define K210_PCF_INTERNAL18 223 /* Internal function signal 18 */ +#define K210_PCF_DEBUG0 224 /* Debug function 0 */ +#define K210_PCF_DEBUG1 225 /* Debug function 1 */ +#define K210_PCF_DEBUG2 226 /* Debug function 2 */ +#define K210_PCF_DEBUG3 227 /* Debug function 3 */ +#define K210_PCF_DEBUG4 228 /* Debug function 4 */ +#define K210_PCF_DEBUG5 229 /* Debug function 5 */ +#define K210_PCF_DEBUG6 230 /* Debug function 6 */ +#define K210_PCF_DEBUG7 231 /* Debug function 7 */ +#define K210_PCF_DEBUG8 232 /* Debug function 8 */ +#define K210_PCF_DEBUG9 233 /* Debug function 9 */ +#define K210_PCF_DEBUG10 234 /* Debug function 10 */ +#define K210_PCF_DEBUG11 235 /* Debug function 11 */ +#define K210_PCF_DEBUG12 236 /* Debug function 12 */ +#define K210_PCF_DEBUG13 237 /* Debug function 13 */ +#define K210_PCF_DEBUG14 238 /* Debug function 14 */ +#define K210_PCF_DEBUG15 239 /* Debug function 15 */ +#define K210_PCF_DEBUG16 240 /* Debug function 16 */ +#define K210_PCF_DEBUG17 241 /* Debug function 17 */ +#define K210_PCF_DEBUG18 242 /* Debug function 18 */ +#define K210_PCF_DEBUG19 243 /* Debug function 19 */ +#define K210_PCF_DEBUG20 244 /* Debug function 20 */ +#define K210_PCF_DEBUG21 245 /* Debug function 21 */ +#define K210_PCF_DEBUG22 246 /* Debug function 22 */ +#define K210_PCF_DEBUG23 247 /* Debug function 23 */ +#define K210_PCF_DEBUG24 248 /* Debug function 24 */ +#define K210_PCF_DEBUG25 249 /* Debug function 25 */ +#define K210_PCF_DEBUG26 250 /* Debug function 26 */ +#define K210_PCF_DEBUG27 251 /* Debug function 27 */ +#define K210_PCF_DEBUG28 252 /* Debug function 28 */ +#define K210_PCF_DEBUG29 253 /* Debug function 29 */ +#define K210_PCF_DEBUG30 254 /* Debug function 30 */ +#define K210_PCF_DEBUG31 255 /* Debug function 31 */ + +#define K210_FPIOA(pin, func) (((pin) << 16) | (func)) + +#define K210_PC_POWER_3V3 0 +#define K210_PC_POWER_1V8 1 + +#endif /* PINCTRL_K210_FPIOA_H */ From 5a2308da9f60312addfb5f7bdd15d96ba1299480 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:47 +0900 Subject: [PATCH 022/506] riscv: Add Canaan Kendryte K210 reset controller Add a reset controller driver for the Canaan Kendryte K210 SoC. This driver relies on its syscon compatible parent node (sysctl) for its register mapping. Default this driver compilation to y when the SOC_CANAAN option is selected. The MAINTAINERS file is updated, adding the entry "CANAAN/KENDRYTE K210 SOC RESET CONTROLLER DRIVER" with myself listed as maintainer for this driver. Signed-off-by: Damien Le Moal Reviewed-by: Philipp Zabel Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 8 +++ arch/riscv/Kconfig.socs | 1 + drivers/reset/Kconfig | 10 +++ drivers/reset/Makefile | 1 + drivers/reset/reset-k210.c | 131 +++++++++++++++++++++++++++++++++++++ 5 files changed, 151 insertions(+) create mode 100644 drivers/reset/reset-k210.c diff --git a/MAINTAINERS b/MAINTAINERS index 4aaa7a17016b..380a446d4d4d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3860,6 +3860,14 @@ W: https://github.com/Cascoda/ca8210-linux.git F: Documentation/devicetree/bindings/net/ieee802154/ca8210.txt F: drivers/net/ieee802154/ca8210.c +CANAAN/KENDRYTE K210 SOC RESET CONTROLLER DRIVER +M: Damien Le Moal +L: linux-kernel@vger.kernel.org +L: linux-riscv@lists.infradead.org +S: Maintained +F: Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml +F: drivers/reset/reset-k210.c + CANAAN/KENDRYTE K210 SOC SYSTEM CONTROLLER DRIVER M: Damien Le Moal L: linux-riscv@lists.infradead.org diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 027042e4d0af..57e53219c500 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -29,6 +29,7 @@ config SOC_CANAAN select SERIAL_SIFIVE if TTY select SERIAL_SIFIVE_CONSOLE if TTY select SIFIVE_PLIC + select ARCH_HAS_RESET_CONTROLLER help This enables support for Canaan Kendryte K210 SoC platform hardware. diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index 71ab75a46491..3db04991a16b 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -89,6 +89,16 @@ config RESET_INTEL_GW Say Y to control the reset signals provided by reset controller. Otherwise, say N. +config RESET_K210 + bool "Reset controller driver for Canaan Kendryte K210 SoC" + depends on (SOC_CANAAN || COMPILE_TEST) && OF + select MFD_SYSCON + default SOC_CANAAN + help + Support for the Canaan Kendryte K210 RISC-V SoC reset controller. + Say Y if you want to control reset signals provided by this + controller. + config RESET_LANTIQ bool "Lantiq XWAY Reset Driver" if COMPILE_TEST default SOC_TYPE_XWAY diff --git a/drivers/reset/Makefile b/drivers/reset/Makefile index 1054123fd187..65a118a91b27 100644 --- a/drivers/reset/Makefile +++ b/drivers/reset/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_RESET_BRCMSTB_RESCAL) += reset-brcmstb-rescal.o obj-$(CONFIG_RESET_HSDK) += reset-hsdk.o obj-$(CONFIG_RESET_IMX7) += reset-imx7.o obj-$(CONFIG_RESET_INTEL_GW) += reset-intel-gw.o +obj-$(CONFIG_RESET_K210) += reset-k210.o obj-$(CONFIG_RESET_LANTIQ) += reset-lantiq.o obj-$(CONFIG_RESET_LPC18XX) += reset-lpc18xx.o obj-$(CONFIG_RESET_MESON) += reset-meson.o diff --git a/drivers/reset/reset-k210.c b/drivers/reset/reset-k210.c new file mode 100644 index 000000000000..1b6e03522b40 --- /dev/null +++ b/drivers/reset/reset-k210.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define K210_RST_MASK 0x27FFFFFF + +struct k210_rst { + struct regmap *map; + struct reset_controller_dev rcdev; +}; + +static inline struct k210_rst * +to_k210_rst(struct reset_controller_dev *rcdev) +{ + return container_of(rcdev, struct k210_rst, rcdev); +} + +static inline int k210_rst_assert(struct reset_controller_dev *rcdev, + unsigned long id) +{ + struct k210_rst *ksr = to_k210_rst(rcdev); + + return regmap_update_bits(ksr->map, K210_SYSCTL_PERI_RESET, BIT(id), 1); +} + +static inline int k210_rst_deassert(struct reset_controller_dev *rcdev, + unsigned long id) +{ + struct k210_rst *ksr = to_k210_rst(rcdev); + + return regmap_update_bits(ksr->map, K210_SYSCTL_PERI_RESET, BIT(id), 0); +} + +static int k210_rst_reset(struct reset_controller_dev *rcdev, + unsigned long id) +{ + int ret; + + ret = k210_rst_assert(rcdev, id); + if (ret == 0) { + udelay(10); + ret = k210_rst_deassert(rcdev, id); + } + + return ret; +} + +static int k210_rst_status(struct reset_controller_dev *rcdev, + unsigned long id) +{ + struct k210_rst *ksr = to_k210_rst(rcdev); + u32 reg, bit = BIT(id); + int ret; + + ret = regmap_read(ksr->map, K210_SYSCTL_PERI_RESET, ®); + if (ret) + return ret; + + return reg & bit; +} + +static int k210_rst_xlate(struct reset_controller_dev *rcdev, + const struct of_phandle_args *reset_spec) +{ + unsigned long id = reset_spec->args[0]; + + if (!(BIT(id) & K210_RST_MASK)) + return -EINVAL; + + return id; +} + +static const struct reset_control_ops k210_rst_ops = { + .assert = k210_rst_assert, + .deassert = k210_rst_deassert, + .reset = k210_rst_reset, + .status = k210_rst_status, +}; + +static int k210_rst_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *parent_np = of_get_parent(dev->of_node); + struct k210_rst *ksr; + + dev_info(dev, "K210 reset controller\n"); + + ksr = devm_kzalloc(dev, sizeof(*ksr), GFP_KERNEL); + if (!ksr) + return -ENOMEM; + + ksr->map = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); + if (IS_ERR(ksr->map)) + return PTR_ERR(ksr->map); + + ksr->rcdev.owner = THIS_MODULE; + ksr->rcdev.dev = dev; + ksr->rcdev.of_node = dev->of_node; + ksr->rcdev.ops = &k210_rst_ops; + ksr->rcdev.nr_resets = fls(K210_RST_MASK); + ksr->rcdev.of_reset_n_cells = 1; + ksr->rcdev.of_xlate = k210_rst_xlate; + + return devm_reset_controller_register(dev, &ksr->rcdev); +} + +static const struct of_device_id k210_rst_dt_ids[] = { + { .compatible = "canaan,k210-rst" }, + { /* sentinel */ }, +}; + +static struct platform_driver k210_rst_driver = { + .probe = k210_rst_probe, + .driver = { + .name = "k210-rst", + .of_match_table = k210_rst_dt_ids, + }, +}; +builtin_platform_driver(k210_rst_driver); From eb75541f8b4535cf22e22cd2e60734866868e818 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 18 Nov 2020 16:38:25 -0800 Subject: [PATCH 023/506] arm64, numa: Change the numa init functions name to be generic This is a preparatory patch for unifying numa implementation between ARM64 & RISC-V. As the numa implementation will be moved to generic code, rename the arm64 related functions to a generic one. Signed-off-by: Atish Patra Acked-by: Catalin Marinas Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/numa.h | 4 ++-- arch/arm64/kernel/acpi_numa.c | 12 ------------ arch/arm64/mm/init.c | 4 ++-- arch/arm64/mm/numa.c | 27 +++++++++++++++++++++++---- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index dd870390d639..ffc1dcdf1871 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -32,7 +32,7 @@ static inline const struct cpumask *cpumask_of_node(int node) } #endif -void __init arm64_numa_init(void); +void __init arch_numa_init(void); int __init numa_add_memblk(int nodeid, u64 start, u64 end); void __init numa_set_distance(int from, int to, int distance); void __init numa_free_distance(void); @@ -46,7 +46,7 @@ void numa_remove_cpu(unsigned int cpu); static inline void numa_store_cpu_info(unsigned int cpu) { } static inline void numa_add_cpu(unsigned int cpu) { } static inline void numa_remove_cpu(unsigned int cpu) { } -static inline void arm64_numa_init(void) { } +static inline void arch_numa_init(void) { } static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { } #endif /* CONFIG_NUMA */ diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index 7ff800045434..fdfecf0991ce 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -118,15 +118,3 @@ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) node_set(node, numa_nodes_parsed); } -int __init arm64_acpi_numa_init(void) -{ - int ret; - - ret = acpi_numa_init(); - if (ret) { - pr_info("Failed to initialise from firmware\n"); - return ret; - } - - return srat_disabled() ? -EINVAL : 0; -} diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 75addb36354a..35adf6a9e98e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -418,10 +418,10 @@ void __init bootmem_init(void) max_pfn = max_low_pfn = max; min_low_pfn = min; - arm64_numa_init(); + arch_numa_init(); /* - * must be done after arm64_numa_init() which calls numa_init() to + * must be done after arch_numa_init() which calls numa_init() to * initialize node_online_map that gets used in hugetlb_cma_reserve() * while allocating required CMA size across online nodes. */ diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index a8303bc6b62a..0dae54ce7d43 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -13,7 +13,6 @@ #include #include -#include #include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; @@ -445,16 +444,36 @@ static int __init dummy_numa_init(void) return 0; } +#ifdef CONFIG_ACPI_NUMA +static int __init arch_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret) { + pr_info("Failed to initialise from firmware\n"); + return ret; + } + + return srat_disabled() ? -EINVAL : 0; +} +#else +static int __init arch_acpi_numa_init(void) +{ + return -EOPNOTSUPP; +} +#endif + /** - * arm64_numa_init() - Initialize NUMA + * arch_numa_init() - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. The * last fallback is dummy single node config encompassing whole memory. */ -void __init arm64_numa_init(void) +void __init arch_numa_init(void) { if (!numa_off) { - if (!acpi_disabled && !numa_init(arm64_acpi_numa_init)) + if (!acpi_disabled && !numa_init(arch_acpi_numa_init)) return; if (acpi_disabled && !numa_init(of_numa_init)) return; From ae3c107cd8bea82cb7cb427d9c5d305b8ce72216 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 18 Nov 2020 16:38:26 -0800 Subject: [PATCH 024/506] numa: Move numa implementation to common code ARM64 numa implementation is generic enough that RISC-V can reuse that implementation with very minor cosmetic changes. This will help both ARM64 and RISC-V in terms of maintanace and feature improvement Move the numa implementation code to common directory so that both ISAs can reuse this. This doesn't introduce any function changes for ARM64. Signed-off-by: Atish Patra Acked-by: Jonathan Cameron Tested-by: Jonathan Cameron Acked-by: Catalin Marinas Signed-off-by: Palmer Dabbelt --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/numa.h | 48 +---------------- arch/arm64/mm/Makefile | 1 - drivers/base/Kconfig | 6 +++ drivers/base/Makefile | 1 + .../mm/numa.c => drivers/base/arch_numa.c | 0 include/asm-generic/numa.h | 52 +++++++++++++++++++ 7 files changed, 61 insertions(+), 48 deletions(-) rename arch/arm64/mm/numa.c => drivers/base/arch_numa.c (100%) create mode 100644 include/asm-generic/numa.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 05e17351e4f3..ff9f5c05cca3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -995,6 +995,7 @@ config HOTPLUG_CPU # Common NUMA Features config NUMA bool "NUMA Memory Allocation and Scheduler Support" + select GENERIC_ARCH_NUMA select ACPI_NUMA if ACPI select OF_NUMA help diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index ffc1dcdf1871..8c8cf4297cc3 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -3,52 +3,6 @@ #define __ASM_NUMA_H #include - -#ifdef CONFIG_NUMA - -#define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) - -int __node_distance(int from, int to); -#define node_distance(a, b) __node_distance(a, b) - -extern nodemask_t numa_nodes_parsed __initdata; - -extern bool numa_off; - -/* Mappings between node number and cpus on that node. */ -extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; -void numa_clear_node(unsigned int cpu); - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS -const struct cpumask *cpumask_of_node(int node); -#else -/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ -static inline const struct cpumask *cpumask_of_node(int node) -{ - if (node == NUMA_NO_NODE) - return cpu_all_mask; - - return node_to_cpumask_map[node]; -} -#endif - -void __init arch_numa_init(void); -int __init numa_add_memblk(int nodeid, u64 start, u64 end); -void __init numa_set_distance(int from, int to, int distance); -void __init numa_free_distance(void); -void __init early_map_cpu_to_node(unsigned int cpu, int nid); -void numa_store_cpu_info(unsigned int cpu); -void numa_add_cpu(unsigned int cpu); -void numa_remove_cpu(unsigned int cpu); - -#else /* CONFIG_NUMA */ - -static inline void numa_store_cpu_info(unsigned int cpu) { } -static inline void numa_add_cpu(unsigned int cpu) { } -static inline void numa_remove_cpu(unsigned int cpu) { } -static inline void arch_numa_init(void) { } -static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { } - -#endif /* CONFIG_NUMA */ +#include #endif /* __ASM_NUMA_H */ diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 5ead3c3de3b6..cd60e4fed78f 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -6,7 +6,6 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o -obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o KASAN_SANITIZE_physaddr.o += n diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 040be48ce046..dbd88e2be88a 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -213,4 +213,10 @@ config GENERIC_ARCH_TOPOLOGY appropriate scaling, sysfs interface for reading capacity values at runtime. +config GENERIC_ARCH_NUMA + bool + help + Enable support for generic NUMA implementation. Currently, RISC-V + and ARM64 use it. + endmenu diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 5e7bf9669a81..8b93a7f291ec 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_PINCTRL) += pinctrl.o obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o +obj-$(CONFIG_GENERIC_ARCH_NUMA) += arch_numa.o obj-y += test/ diff --git a/arch/arm64/mm/numa.c b/drivers/base/arch_numa.c similarity index 100% rename from arch/arm64/mm/numa.c rename to drivers/base/arch_numa.c diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h new file mode 100644 index 000000000000..1a3ad6d29833 --- /dev/null +++ b/include/asm-generic/numa.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_GENERIC_NUMA_H +#define __ASM_GENERIC_NUMA_H + +#ifdef CONFIG_NUMA + +#define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) + +int __node_distance(int from, int to); +#define node_distance(a, b) __node_distance(a, b) + +extern nodemask_t numa_nodes_parsed __initdata; + +extern bool numa_off; + +/* Mappings between node number and cpus on that node. */ +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +void numa_clear_node(unsigned int cpu); + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +const struct cpumask *cpumask_of_node(int node); +#else +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +static inline const struct cpumask *cpumask_of_node(int node) +{ + if (node == NUMA_NO_NODE) + return cpu_all_mask; + + return node_to_cpumask_map[node]; +} +#endif + +void __init arch_numa_init(void); +int __init numa_add_memblk(int nodeid, u64 start, u64 end); +void __init numa_set_distance(int from, int to, int distance); +void __init numa_free_distance(void); +void __init early_map_cpu_to_node(unsigned int cpu, int nid); +void numa_store_cpu_info(unsigned int cpu); +void numa_add_cpu(unsigned int cpu); +void numa_remove_cpu(unsigned int cpu); + +#else /* CONFIG_NUMA */ + +static inline void numa_store_cpu_info(unsigned int cpu) { } +static inline void numa_add_cpu(unsigned int cpu) { } +static inline void numa_remove_cpu(unsigned int cpu) { } +static inline void arch_numa_init(void) { } +static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { } + +#endif /* CONFIG_NUMA */ + +#endif /* __ASM_GENERIC_NUMA_H */ From cbd34f4bb37d62d8a027f54205bff07e73340da4 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 18 Nov 2020 16:38:27 -0800 Subject: [PATCH 025/506] riscv: Separate memory init from paging init Currently, we perform some memory init functions in paging init. But, that will be an issue for NUMA support where DT needs to be flattened before numa initialization and memblock_present can only be called after numa initialization. Move memory initialization related functions to a separate function. Signed-off-by: Atish Patra Reviewed-by: Greentime Hu Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 1 + arch/riscv/kernel/setup.c | 1 + arch/riscv/mm/init.c | 6 +++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 41a72861987c..4dd56ea4a687 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -469,6 +469,7 @@ extern void *dtb_early_va; extern uintptr_t dtb_early_pa; void setup_bootmem(void); void paging_init(void); +void misc_mem_init(void); #define FIRST_USER_ADDRESS 0 diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index a43a954ef529..9cd81d4df031 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -250,6 +250,7 @@ void __init setup_arch(char **cmdline_p) else pr_err("No DTB found in kernel mappings\n"); #endif + misc_mem_init(); sbi_init(); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 77bd23f47a72..62716b43660b 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -649,8 +649,12 @@ void mark_rodata_ro(void) void __init paging_init(void) { setup_vm_final(); - sparse_init(); setup_zero_page(); +} + +void __init misc_mem_init(void) +{ + sparse_init(); zone_sizes_init(); } From 3e5b0bdb2a4dd8c09a9db01b1ead3f69cabd0c67 Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Wed, 18 Nov 2020 16:38:28 -0800 Subject: [PATCH 026/506] riscv: Add support pte_protnone and pmd_protnone if CONFIG_NUMA_BALANCING These two functions are used to distinguish between PROT_NONENUMA protections and hinting fault protections. Signed-off-by: Greentime Hu Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 4dd56ea4a687..25e90cf0bde4 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -187,6 +187,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) return (unsigned long)pfn_to_virt(pmd_val(pmd) >> _PAGE_PFN_SHIFT); } +static inline pte_t pmd_pte(pmd_t pmd) +{ + return __pte(pmd_val(pmd)); +} + /* Yields the page frame number (PFN) of a page table entry */ static inline unsigned long pte_pfn(pte_t pte) { @@ -290,6 +295,21 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +#ifdef CONFIG_NUMA_BALANCING +/* + * See the comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) == _PAGE_PROT_NONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pte_protnone(pmd_pte(pmd)); +} +#endif + /* Modify page protection bits */ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { From 4f0e8eef772ee4438f304b2178bc28c958b6c13d Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 18 Nov 2020 16:38:29 -0800 Subject: [PATCH 027/506] riscv: Add numa support for riscv64 platform Use the generic numa implementation to add NUMA support for RISC-V. This is based on Greentime's patch[1] but modified to use generic NUMA implementation and few more fixes. [1] https://lkml.org/lkml/2020/1/10/233 Co-developed-by: Greentime Hu Signed-off-by: Greentime Hu Signed-off-by: Atish Patra Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 31 ++++++++++++++++++++++++++++++- arch/riscv/include/asm/mmzone.h | 13 +++++++++++++ arch/riscv/include/asm/numa.h | 8 ++++++++ arch/riscv/include/asm/pci.h | 14 ++++++++++++++ arch/riscv/kernel/setup.c | 10 ++++++++-- arch/riscv/kernel/smpboot.c | 12 +++++++++++- arch/riscv/mm/init.c | 4 +++- 7 files changed, 87 insertions(+), 5 deletions(-) create mode 100644 arch/riscv/include/asm/mmzone.h create mode 100644 arch/riscv/include/asm/numa.h diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 81b76d44725d..2ef05ef921b5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -143,7 +143,7 @@ config PAGE_OFFSET default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB config ARCH_FLATMEM_ENABLE - def_bool y + def_bool !NUMA config ARCH_SPARSEMEM_ENABLE def_bool y @@ -298,6 +298,35 @@ config TUNE_GENERIC endchoice +# Common NUMA Features +config NUMA + bool "NUMA Memory Allocation and Scheduler Support" + select GENERIC_ARCH_NUMA + select OF_NUMA + select ARCH_SUPPORTS_NUMA_BALANCING + help + Enable NUMA (Non-Uniform Memory Access) support. + + The kernel will try to allocate memory used by a CPU on the + local memory of the CPU and add some more NUMA awareness to the kernel. + +config NODES_SHIFT + int "Maximum NUMA Nodes (as a power of 2)" + range 1 10 + default "2" + depends on NEED_MULTIPLE_NODES + help + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accommodate various tables. + +config USE_PERCPU_NUMA_NODE_ID + def_bool y + depends on NUMA + +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + depends on NUMA + config RISCV_ISA_C bool "Emit compressed instructions when building Linux" default y diff --git a/arch/riscv/include/asm/mmzone.h b/arch/riscv/include/asm/mmzone.h new file mode 100644 index 000000000000..fa17e01d9ab2 --- /dev/null +++ b/arch/riscv/include/asm/mmzone.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMZONE_H +#define __ASM_MMZONE_H + +#ifdef CONFIG_NUMA + +#include + +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[(nid)]) + +#endif /* CONFIG_NUMA */ +#endif /* __ASM_MMZONE_H */ diff --git a/arch/riscv/include/asm/numa.h b/arch/riscv/include/asm/numa.h new file mode 100644 index 000000000000..8c8cf4297cc3 --- /dev/null +++ b/arch/riscv/include/asm/numa.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_NUMA_H +#define __ASM_NUMA_H + +#include +#include + +#endif /* __ASM_NUMA_H */ diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h index 1c473a1bd986..658e112c3ce7 100644 --- a/arch/riscv/include/asm/pci.h +++ b/arch/riscv/include/asm/pci.h @@ -32,6 +32,20 @@ static inline int pci_proc_domain(struct pci_bus *bus) /* always show the domain in /proc */ return 1; } + +#ifdef CONFIG_NUMA + +static inline int pcibus_to_node(struct pci_bus *bus) +{ + return dev_to_node(&bus->dev); +} +#ifndef cpumask_of_pcibus +#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ + cpu_all_mask : \ + cpumask_of_node(pcibus_to_node(bus))) +#endif +#endif /* CONFIG_NUMA */ + #endif /* CONFIG_PCI */ #endif /* _ASM_RISCV_PCI_H */ diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 9cd81d4df031..90a4e06886a6 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -273,13 +273,19 @@ void __init setup_arch(char **cmdline_p) static int __init topology_init(void) { - int i; + int i, ret; + + for_each_online_node(i) + register_one_node(i); for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_devices, i); cpu->hotpluggable = cpu_has_hotplug(i); - register_cpu(cpu, i); + ret = register_cpu(cpu, i); + if (unlikely(ret)) + pr_warn("Warning: %s: register_cpu %d failed (%d)\n", + __func__, i, ret); } return 0; diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 96167d55ed98..5e276c25646f 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -45,13 +46,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus) { int cpuid; int ret; + unsigned int curr_cpuid; + + curr_cpuid = smp_processor_id(); + numa_store_cpu_info(curr_cpuid); + numa_add_cpu(curr_cpuid); /* This covers non-smp usecase mandated by "nosmp" option */ if (max_cpus == 0) return; for_each_possible_cpu(cpuid) { - if (cpuid == smp_processor_id()) + if (cpuid == curr_cpuid) continue; if (cpu_ops[cpuid]->cpu_prepare) { ret = cpu_ops[cpuid]->cpu_prepare(cpuid); @@ -59,6 +65,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) continue; } set_cpu_present(cpuid, true); + numa_store_cpu_info(cpuid); } } @@ -79,6 +86,7 @@ void __init setup_smp(void) if (hart == cpuid_to_hartid_map(0)) { BUG_ON(found_boot_cpu); found_boot_cpu = 1; + early_map_cpu_to_node(0, of_node_to_nid(dn)); continue; } if (cpuid >= NR_CPUS) { @@ -88,6 +96,7 @@ void __init setup_smp(void) } cpuid_to_hartid_map(cpuid) = hart; + early_map_cpu_to_node(cpuid, of_node_to_nid(dn)); cpuid++; } @@ -153,6 +162,7 @@ asmlinkage __visible void smp_callin(void) current->active_mm = mm; notify_cpu_starting(curr_cpuid); + numa_add_cpu(curr_cpuid); update_siblings_masks(curr_cpuid); set_cpu_online(curr_cpuid, 1); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 62716b43660b..30b61f2c6b87 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "../kernel/head.h" @@ -199,7 +200,6 @@ void __init setup_bootmem(void) early_init_fdt_scan_reserved_mem(); dma_contiguous_reserve(dma32_phys_limit); memblock_allow_resize(); - memblock_dump_all(); } #ifdef CONFIG_MMU @@ -654,8 +654,10 @@ void __init paging_init(void) void __init misc_mem_init(void) { + arch_numa_init(); sparse_init(); zone_sizes_init(); + memblock_dump_all(); } #ifdef CONFIG_SPARSEMEM_VMEMMAP From 46ad48e8a28da7cc37a16c7e7fc632ecf906e4bf Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 25 Nov 2020 19:44:15 +0800 Subject: [PATCH 028/506] riscv: Add machine name to kernel boot log and stack dump output Add the machine name to kernel boot-up log, and install the machine name to stack dump for DT boot mode. Signed-off-by: Kefeng Wang Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 90a4e06886a6..4945b5085241 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -214,8 +214,15 @@ static void __init init_resources(void) static void __init parse_dtb(void) { /* Early scan of device tree from init memory */ - if (early_init_dt_scan(dtb_early_va)) + if (early_init_dt_scan(dtb_early_va)) { + const char *name = of_flat_dt_get_machine_name(); + + if (name) { + pr_info("Machine model: %s\n", name); + dump_stack_set_arch_desc("%s (DT)", name); + } return; + } pr_err("No DTB passed to the kernel\n"); #ifdef CONFIG_CMDLINE_FORCE From dcdc7a53a890218a16cd6e2a69e526bd96eb9399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20St=C3=A4hlin?= Date: Thu, 17 Dec 2020 16:01:37 +0000 Subject: [PATCH 029/506] RISC-V: Implement ptrace regs and stack API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Needed for kprobes support. Copied and adapted from arm64 code. Guo Ren fixup pt_regs type for linux-5.8-rc1. Signed-off-by: Patrick Stählin Signed-off-by: Guo Ren Reviewed-by: Pekka Enberg Reviewed-by: Zong Li Reviewed-by: Masami Hiramatsu Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/ptrace.h | 29 ++++++++++ arch/riscv/kernel/ptrace.c | 99 +++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 2ef05ef921b5..1027384e2632 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -75,6 +75,7 @@ config RISCV select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index ee49f80c9533..23372bbcfde8 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -8,6 +8,7 @@ #include #include +#include #ifndef __ASSEMBLY__ @@ -60,6 +61,7 @@ struct pt_regs { #define user_mode(regs) (((regs)->status & SR_PP) == 0) +#define MAX_REG_OFFSET offsetof(struct pt_regs, orig_a0) /* Helpers for working with the instruction pointer */ static inline unsigned long instruction_pointer(struct pt_regs *regs) @@ -85,6 +87,12 @@ static inline void user_stack_pointer_set(struct pt_regs *regs, regs->sp = val; } +/* Valid only for Kernel mode traps. */ +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + return regs->sp; +} + /* Helpers for working with the frame pointer */ static inline unsigned long frame_pointer(struct pt_regs *regs) { @@ -101,6 +109,27 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->a0; } +extern int regs_query_register_offset(const char *name); +extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, + unsigned int n); + +/** + * regs_get_register() - get register value from its offset + * @regs: pt_regs from which register value is gotten + * @offset: offset of the register. + * + * regs_get_register returns the value of a register whose offset from @regs. + * The @offset is the offset of the register in struct pt_regs. + * If @offset is bigger than MAX_REG_OFFSET, this returns 0. + */ +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned int offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + + return *(unsigned long *)((unsigned long)regs + offset); +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 2d6395f5ad54..1a85305720e8 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -114,6 +114,105 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) return &riscv_user_native_view; } +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { + REG_OFFSET_NAME(epc), + REG_OFFSET_NAME(ra), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(gp), + REG_OFFSET_NAME(tp), + REG_OFFSET_NAME(t0), + REG_OFFSET_NAME(t1), + REG_OFFSET_NAME(t2), + REG_OFFSET_NAME(s0), + REG_OFFSET_NAME(s1), + REG_OFFSET_NAME(a0), + REG_OFFSET_NAME(a1), + REG_OFFSET_NAME(a2), + REG_OFFSET_NAME(a3), + REG_OFFSET_NAME(a4), + REG_OFFSET_NAME(a5), + REG_OFFSET_NAME(a6), + REG_OFFSET_NAME(a7), + REG_OFFSET_NAME(s2), + REG_OFFSET_NAME(s3), + REG_OFFSET_NAME(s4), + REG_OFFSET_NAME(s5), + REG_OFFSET_NAME(s6), + REG_OFFSET_NAME(s7), + REG_OFFSET_NAME(s8), + REG_OFFSET_NAME(s9), + REG_OFFSET_NAME(s10), + REG_OFFSET_NAME(s11), + REG_OFFSET_NAME(t3), + REG_OFFSET_NAME(t4), + REG_OFFSET_NAME(t5), + REG_OFFSET_NAME(t6), + REG_OFFSET_NAME(status), + REG_OFFSET_NAME(badaddr), + REG_OFFSET_NAME(cause), + REG_OFFSET_NAME(orig_a0), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_within_kernel_stack() - check the address in the stack + * @regs: pt_regs which contains kernel stack pointer. + * @addr: address which is checked. + * + * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). + * If @addr is within the kernel stack, it returns true. If not, returns false. + */ +static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + return (addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specified by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; +} + void ptrace_disable(struct task_struct *child) { clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); From edfcf91fe4f84984860416a58719b48a71893909 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:38 +0000 Subject: [PATCH 030/506] riscv: Fixup compile error BUILD_BUG_ON failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unfortunately, the current code couldn't be compiled: CC arch/riscv/kernel/patch.o In file included from ./include/linux/kernel.h:11, from ./include/linux/list.h:9, from ./include/linux/preempt.h:11, from ./include/linux/spinlock.h:51, from arch/riscv/kernel/patch.c:6: In function ‘fix_to_virt’, inlined from ‘patch_map’ at arch/riscv/kernel/patch.c:37:17: ./include/linux/compiler.h:392:38: error: call to ‘__compiletime_assert_205’ declared with attribute error: BUILD_BUG_ON failed: idx >= __end_of_fixed_addresses _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) ^ ./include/linux/compiler.h:373:4: note: in definition of macro ‘__compiletime_assert’ prefix ## suffix(); \ ^~~~~~ ./include/linux/compiler.h:392:2: note: in expansion of macro ‘_compiletime_assert’ _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) ^~~~~~~~~~~~~~~~~~~ ./include/linux/build_bug.h:39:37: note: in expansion of macro ‘compiletime_assert’ #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) ^~~~~~~~~~~~~~~~~~ ./include/linux/build_bug.h:50:2: note: in expansion of macro ‘BUILD_BUG_ON_MSG’ BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) ^~~~~~~~~~~~~~~~ ./include/asm-generic/fixmap.h:32:2: note: in expansion of macro ‘BUILD_BUG_ON’ BUILD_BUG_ON(idx >= __end_of_fixed_addresses); ^~~~~~~~~~~~ Because fix_to_virt(, idx) needs a const value, not a dynamic variable of reg-a0 or BUILD_BUG_ON failed with "idx >= __end_of_fixed_addresses". Signed-off-by: Guo Ren Reviewed-by: Masami Hiramatsu Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/patch.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 3fe7a5296aa5..0b552873a577 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -20,7 +20,12 @@ struct patch_insn { }; #ifdef CONFIG_MMU -static void *patch_map(void *addr, int fixmap) +/* + * The fix_to_virt(, idx) needs a const value (not a dynamic variable of + * reg-a0) or BUILD_BUG_ON failed with "idx >= __end_of_fixed_addresses". + * So use '__always_inline' and 'const unsigned int fixmap' here. + */ +static __always_inline void *patch_map(void *addr, const unsigned int fixmap) { uintptr_t uintaddr = (uintptr_t) addr; struct page *page; @@ -37,7 +42,6 @@ static void *patch_map(void *addr, int fixmap) return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + (uintaddr & ~PAGE_MASK)); } -NOKPROBE_SYMBOL(patch_map); static void patch_unmap(int fixmap) { From 67d945778099b14324811fe67c5aff2cda7a7ad5 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:39 +0000 Subject: [PATCH 031/506] riscv: Fixup wrong ftrace remove cflag We must use $(CC_FLAGS_FTRACE) instead of directly using -pg. It will cause -fpatchable-function-entry error. Signed-off-by: Guo Ren Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 4 ++-- arch/riscv/mm/Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f6caf4d9ca15..22c66dde18a0 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -4,8 +4,8 @@ # ifdef CONFIG_FTRACE -CFLAGS_REMOVE_ftrace.o = -pg -CFLAGS_REMOVE_patch.o = -pg +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) endif extra-y += head.o diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index c0185e556ca5..6b4b7ec1bda2 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -2,7 +2,7 @@ CFLAGS_init.o := -mcmodel=medany ifdef CONFIG_FTRACE -CFLAGS_REMOVE_init.o = -pg +CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) endif KCOV_INSTRUMENT_init.o := n From 5ad84adf5456313e285734102367c861c436c5ed Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:40 +0000 Subject: [PATCH 032/506] riscv: Fixup patch_text panic in ftrace Just like arm64, we can't trace the function in the patch_text path. Here is the bug log: [ 45.234334] Unable to handle kernel paging request at virtual address ffffffd38ae80900 [ 45.242313] Oops [#1] [ 45.244600] Modules linked in: [ 45.247678] CPU: 0 PID: 11 Comm: migration/0 Not tainted 5.9.0-00025-g9b7db83-dirty #215 [ 45.255797] epc: ffffffe00021689a ra : ffffffe00021718e sp : ffffffe01afabb58 [ 45.262955] gp : ffffffe00136afa0 tp : ffffffe01af94d00 t0 : 0000000000000002 [ 45.270200] t1 : 0000000000000000 t2 : 0000000000000001 s0 : ffffffe01afabc08 [ 45.277443] s1 : ffffffe0013718a8 a0 : 0000000000000000 a1 : ffffffe01afabba8 [ 45.284686] a2 : 0000000000000000 a3 : 0000000000000000 a4 : c4c16ad38ae80900 [ 45.291929] a5 : 0000000000000000 a6 : 0000000000000000 a7 : 0000000052464e43 [ 45.299173] s2 : 0000000000000001 s3 : ffffffe000206a60 s4 : ffffffe000206a60 [ 45.306415] s5 : 00000000000009ec s6 : ffffffe0013718a8 s7 : c4c16ad38ae80900 [ 45.313658] s8 : 0000000000000004 s9 : 0000000000000001 s10: 0000000000000001 [ 45.320902] s11: 0000000000000003 t3 : 0000000000000001 t4 : ffffffffd192fe79 [ 45.328144] t5 : ffffffffb8f80000 t6 : 0000000000040000 [ 45.333472] status: 0000000200000100 badaddr: ffffffd38ae80900 cause: 000000000000000f [ 45.341514] ---[ end trace d95102172248fdcf ]--- [ 45.346176] note: migration/0[11] exited with preempt_count 1 (gdb) x /2i $pc => 0xffffffe00021689a <__do_proc_dointvec+196>: sd zero,0(s7) 0xffffffe00021689e <__do_proc_dointvec+200>: li s11,0 (gdb) bt 0 __do_proc_dointvec (tbl_data=0x0, table=0xffffffe01afabba8, write=0, buffer=0x0, lenp=0x7bf897061f9a0800, ppos=0x4, conv=0x0, data=0x52464e43) at kernel/sysctl.c:581 1 0xffffffe00021718e in do_proc_dointvec (data=, conv=, ppos=, lenp=, buffer=, write=, table=) at kernel/sysctl.c:964 2 proc_dointvec_minmax (ppos=, lenp=, buffer=, write=, table=) at kernel/sysctl.c:964 3 proc_do_static_key (table=, write=1, buffer=0x0, lenp=0x0, ppos=0x7bf897061f9a0800) at kernel/sysctl.c:1643 4 0xffffffe000206792 in ftrace_make_call (rec=, addr=) at arch/riscv/kernel/ftrace.c:109 5 0xffffffe0002c9c04 in __ftrace_replace_code (rec=0xffffffe01ae40c30, enable=3) at kernel/trace/ftrace.c:2503 6 0xffffffe0002ca0b2 in ftrace_replace_code (mod_flags=) at kernel/trace/ftrace.c:2530 7 0xffffffe0002ca26a in ftrace_modify_all_code (command=5) at kernel/trace/ftrace.c:2677 8 0xffffffe0002ca30e in __ftrace_modify_code (data=) at kernel/trace/ftrace.c:2703 9 0xffffffe0002c13b0 in multi_cpu_stop (data=0x0) at kernel/stop_machine.c:224 10 0xffffffe0002c0fde in cpu_stopper_thread (cpu=) at kernel/stop_machine.c:491 11 0xffffffe0002343de in smpboot_thread_fn (data=0x0) at kernel/smpboot.c:165 12 0xffffffe00022f8b4 in kthread (_create=0xffffffe01af0c040) at kernel/kthread.c:292 13 0xffffffe000201fac in handle_exception () at arch/riscv/kernel/entry.S:236 0xffffffe00020678a <+114>: auipc ra,0xffffe 0xffffffe00020678e <+118>: jalr -118(ra) # 0xffffffe000204714 0xffffffe000206792 <+122>: snez a0,a0 (gdb) disassemble patch_text_nosync Dump of assembler code for function patch_text_nosync: 0xffffffe000204714 <+0>: addi sp,sp,-32 0xffffffe000204716 <+2>: sd s0,16(sp) 0xffffffe000204718 <+4>: sd ra,24(sp) 0xffffffe00020471a <+6>: addi s0,sp,32 0xffffffe00020471c <+8>: auipc ra,0x0 0xffffffe000204720 <+12>: jalr -384(ra) # 0xffffffe00020459c 0xffffffe000204724 <+16>: beqz a0,0xffffffe00020472e 0xffffffe000204726 <+18>: ld ra,24(sp) 0xffffffe000204728 <+20>: ld s0,16(sp) 0xffffffe00020472a <+22>: addi sp,sp,32 0xffffffe00020472c <+24>: ret 0xffffffe00020472e <+26>: sd a0,-24(s0) 0xffffffe000204732 <+30>: auipc ra,0x4 0xffffffe000204736 <+34>: jalr -1464(ra) # 0xffffffe00020817a 0xffffffe00020473a <+38>: ld a0,-24(s0) 0xffffffe00020473e <+42>: ld ra,24(sp) 0xffffffe000204740 <+44>: ld s0,16(sp) 0xffffffe000204742 <+46>: addi sp,sp,32 0xffffffe000204744 <+48>: ret (gdb) disassemble flush_icache_all-4 Dump of assembler code for function flush_icache_all: 0xffffffe00020817a <+0>: addi sp,sp,-8 0xffffffe00020817c <+2>: sd ra,0(sp) 0xffffffe00020817e <+4>: auipc ra,0xfffff 0xffffffe000208182 <+8>: jalr -1822(ra) # 0xffffffe000206a60 0xffffffe000208186 <+12>: ld ra,0(sp) 0xffffffe000208188 <+14>: addi sp,sp,8 0xffffffe00020818a <+0>: addi sp,sp,-16 0xffffffe00020818c <+2>: sd s0,0(sp) 0xffffffe00020818e <+4>: sd ra,8(sp) 0xffffffe000208190 <+6>: addi s0,sp,16 0xffffffe000208192 <+8>: li a0,0 0xffffffe000208194 <+10>: auipc ra,0xfffff 0xffffffe000208198 <+14>: jalr -410(ra) # 0xffffffe000206ffa 0xffffffe00020819c <+18>: ld s0,0(sp) 0xffffffe00020819e <+20>: ld ra,8(sp) 0xffffffe0002081a0 <+22>: addi sp,sp,16 0xffffffe0002081a2 <+24>: ret (gdb) frame 5 (rec=0xffffffe01ae40c30, enable=3) at kernel/trace/ftrace.c:2503 2503 return ftrace_make_call(rec, ftrace_addr); (gdb) p /x rec->ip $2 = 0xffffffe00020817a -> flush_icache_all ! When we modified flush_icache_all's patchable-entry with ftrace_caller: - Insert ftrace_caller at flush_icache_all prologue. - Call flush_icache_all to sync I/Dcache, but flush_icache_all is just we modified by half. Link: https://lore.kernel.org/linux-riscv/CAJF2gTT=oDWesWe0JVWvTpGi60-gpbNhYLdFWN_5EbyeqoEDdw@mail.gmail.com/T/#t Signed-off-by: Guo Ren Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 1 + arch/riscv/mm/Makefile | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 22c66dde18a0..4eee315d7954 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -6,6 +6,7 @@ ifdef CONFIG_FTRACE CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) endif extra-y += head.o diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 6b4b7ec1bda2..7ebaef10ea1b 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -3,6 +3,7 @@ CFLAGS_init.o := -mcmodel=medany ifdef CONFIG_FTRACE CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_cacheflush.o = $(CC_FLAGS_FTRACE) endif KCOV_INSTRUMENT_init.o := n From afc76b8b80112189b6f11e67e19cf58301944814 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:41 +0000 Subject: [PATCH 033/506] riscv: Using PATCHABLE_FUNCTION_ENTRY instead of MCOUNT This patch changes the current detour mechanism of dynamic ftrace which has been discussed during LPC 2020 RISCV-MC [1]. Before the patch, we used mcount for detour: : addi sp,sp,-16 sd ra,8(sp) sd s0,0(sp) addi s0,sp,16 mv a5,ra mv a0,a5 auipc ra,0x0 -> nop jalr -296(ra) <_mcount@plt> ->nop ... After the patch, we use nop call site area for detour: : nop -> REG_S ra, -SZREG(sp) nop -> auipc ra, 0x? nop -> jalr ?(ra) nop -> REG_L ra, -SZREG(sp) ... The mcount mechanism is mixed with gcc function prologue which is not very clear. The patchable function entry just put 16 bytes nop before the front of the function prologue which could be filled with a separated detour mechanism. [1] https://www.linuxplumbersconf.org/event/7/contributions/807/ Signed-off-by: Guo Ren Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 2 + arch/riscv/kernel/ftrace.c | 95 ++++----- arch/riscv/kernel/mcount-dyn.S | 342 +++++++++++++++------------------ 3 files changed, 204 insertions(+), 235 deletions(-) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 597b58b2d4c0..2372464ad27d 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -12,6 +12,8 @@ OBJCOPYFLAGS := -O binary LDFLAGS_vmlinux := ifeq ($(CONFIG_DYNAMIC_FTRACE),y) LDFLAGS_vmlinux := --no-relax + KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY + CC_FLAGS_FTRACE := -fpatchable-function-entry=8 endif ifeq ($(CONFIG_64BIT)$(CONFIG_CMODEL_MEDLOW),yy) diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 765b62434f30..7f1e5203de88 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -72,29 +72,56 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, return 0; } +/* + * Put 5 instructions with 16 bytes at the front of function within + * patchable function entry nops' area. + * + * 0: REG_S ra, -SZREG(sp) + * 1: auipc ra, 0x? + * 2: jalr -?(ra) + * 3: REG_L ra, -SZREG(sp) + * + * So the opcodes is: + * 0: 0xfe113c23 (sd)/0xfe112e23 (sw) + * 1: 0x???????? -> auipc + * 2: 0x???????? -> jalr + * 3: 0xff813083 (ld)/0xffc12083 (lw) + */ +#if __riscv_xlen == 64 +#define INSN0 0xfe113c23 +#define INSN3 0xff813083 +#elif __riscv_xlen == 32 +#define INSN0 0xfe112e23 +#define INSN3 0xffc12083 +#endif + +#define FUNC_ENTRY_SIZE 16 +#define FUNC_ENTRY_JMP 4 + int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - int ret = ftrace_check_current_call(rec->ip, NULL); + unsigned int call[4] = {INSN0, 0, 0, INSN3}; + unsigned long target = addr; + unsigned long caller = rec->ip + FUNC_ENTRY_JMP; - if (ret) - return ret; + call[1] = to_auipc_insn((unsigned int)(target - caller)); + call[2] = to_jalr_insn((unsigned int)(target - caller)); - return __ftrace_modify_call(rec->ip, addr, true); + if (patch_text_nosync((void *)rec->ip, call, FUNC_ENTRY_SIZE)) + return -EPERM; + + return 0; } int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned int call[2]; - int ret; + unsigned int nops[4] = {NOP4, NOP4, NOP4, NOP4}; - make_call(rec->ip, addr, call); - ret = ftrace_check_current_call(rec->ip, call); + if (patch_text_nosync((void *)rec->ip, nops, FUNC_ENTRY_SIZE)) + return -EPERM; - if (ret) - return ret; - - return __ftrace_modify_call(rec->ip, addr, false); + return 0; } @@ -139,15 +166,16 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { unsigned int call[2]; + unsigned long caller = rec->ip + FUNC_ENTRY_JMP; int ret; - make_call(rec->ip, old_addr, call); - ret = ftrace_check_current_call(rec->ip, call); + make_call(caller, old_addr, call); + ret = ftrace_check_current_call(caller, call); if (ret) return ret; - return __ftrace_modify_call(rec->ip, addr, true); + return __ftrace_modify_call(caller, addr, true); } #endif @@ -176,53 +204,30 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, #ifdef CONFIG_DYNAMIC_FTRACE extern void ftrace_graph_call(void); +extern void ftrace_graph_regs_call(void); int ftrace_enable_ftrace_graph_caller(void) { - unsigned int call[2]; - static int init_graph = 1; int ret; - make_call(&ftrace_graph_call, &ftrace_stub, call); - - /* - * When enabling graph tracer for the first time, ftrace_graph_call - * should contains a call to ftrace_stub. Once it has been disabled, - * the 8-bytes at the position becomes NOPs. - */ - if (init_graph) { - ret = ftrace_check_current_call((unsigned long)&ftrace_graph_call, - call); - init_graph = 0; - } else { - ret = ftrace_check_current_call((unsigned long)&ftrace_graph_call, - NULL); - } - + ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, + (unsigned long)&prepare_ftrace_return, true); if (ret) return ret; - return __ftrace_modify_call((unsigned long)&ftrace_graph_call, + return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, (unsigned long)&prepare_ftrace_return, true); } int ftrace_disable_ftrace_graph_caller(void) { - unsigned int call[2]; int ret; - make_call(&ftrace_graph_call, &prepare_ftrace_return, call); - - /* - * This is to make sure that ftrace_enable_ftrace_graph_caller - * did the right thing. - */ - ret = ftrace_check_current_call((unsigned long)&ftrace_graph_call, - call); - + ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, + (unsigned long)&prepare_ftrace_return, false); if (ret) return ret; - return __ftrace_modify_call((unsigned long)&ftrace_graph_call, + return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, (unsigned long)&prepare_ftrace_return, false); } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 35a6ed76cb8b..d171eca623b6 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -13,224 +13,186 @@ .text - .macro SAVE_ABI_STATE -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi sp, sp, -48 - sd s0, 32(sp) - sd ra, 40(sp) - addi s0, sp, 48 - sd t0, 24(sp) - sd t1, 16(sp) -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - sd t2, 8(sp) -#endif -#else - addi sp, sp, -16 - sd s0, 0(sp) - sd ra, 8(sp) - addi s0, sp, 16 -#endif +#define FENTRY_RA_OFFSET 12 +#define ABI_SIZE_ON_STACK 72 +#define ABI_A0 0 +#define ABI_A1 8 +#define ABI_A2 16 +#define ABI_A3 24 +#define ABI_A4 32 +#define ABI_A5 40 +#define ABI_A6 48 +#define ABI_A7 56 +#define ABI_RA 64 + + .macro SAVE_ABI + addi sp, sp, -SZREG + addi sp, sp, -ABI_SIZE_ON_STACK + + REG_S a0, ABI_A0(sp) + REG_S a1, ABI_A1(sp) + REG_S a2, ABI_A2(sp) + REG_S a3, ABI_A3(sp) + REG_S a4, ABI_A4(sp) + REG_S a5, ABI_A5(sp) + REG_S a6, ABI_A6(sp) + REG_S a7, ABI_A7(sp) + REG_S ra, ABI_RA(sp) .endm - .macro RESTORE_ABI_STATE -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - ld s0, 32(sp) - ld ra, 40(sp) - addi sp, sp, 48 -#else - ld ra, 8(sp) - ld s0, 0(sp) - addi sp, sp, 16 -#endif + .macro RESTORE_ABI + REG_L a0, ABI_A0(sp) + REG_L a1, ABI_A1(sp) + REG_L a2, ABI_A2(sp) + REG_L a3, ABI_A3(sp) + REG_L a4, ABI_A4(sp) + REG_L a5, ABI_A5(sp) + REG_L a6, ABI_A6(sp) + REG_L a7, ABI_A7(sp) + REG_L ra, ABI_RA(sp) + + addi sp, sp, ABI_SIZE_ON_STACK + addi sp, sp, SZREG .endm - .macro RESTORE_GRAPH_ARGS - ld a0, 24(sp) - ld a1, 16(sp) -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - ld a2, 8(sp) -#endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + .macro SAVE_ALL + addi sp, sp, -SZREG + addi sp, sp, -PT_SIZE_ON_STACK + + REG_S x1, PT_EPC(sp) + addi sp, sp, PT_SIZE_ON_STACK + REG_L x1, (sp) + addi sp, sp, -PT_SIZE_ON_STACK + REG_S x1, PT_RA(sp) + REG_L x1, PT_EPC(sp) + + REG_S x2, PT_SP(sp) + REG_S x3, PT_GP(sp) + REG_S x4, PT_TP(sp) + REG_S x5, PT_T0(sp) + REG_S x6, PT_T1(sp) + REG_S x7, PT_T2(sp) + REG_S x8, PT_S0(sp) + REG_S x9, PT_S1(sp) + REG_S x10, PT_A0(sp) + REG_S x11, PT_A1(sp) + REG_S x12, PT_A2(sp) + REG_S x13, PT_A3(sp) + REG_S x14, PT_A4(sp) + REG_S x15, PT_A5(sp) + REG_S x16, PT_A6(sp) + REG_S x17, PT_A7(sp) + REG_S x18, PT_S2(sp) + REG_S x19, PT_S3(sp) + REG_S x20, PT_S4(sp) + REG_S x21, PT_S5(sp) + REG_S x22, PT_S6(sp) + REG_S x23, PT_S7(sp) + REG_S x24, PT_S8(sp) + REG_S x25, PT_S9(sp) + REG_S x26, PT_S10(sp) + REG_S x27, PT_S11(sp) + REG_S x28, PT_T3(sp) + REG_S x29, PT_T4(sp) + REG_S x30, PT_T5(sp) + REG_S x31, PT_T6(sp) .endm -ENTRY(ftrace_graph_caller) - addi sp, sp, -16 - sd s0, 0(sp) - sd ra, 8(sp) - addi s0, sp, 16 -ftrace_graph_call: - .global ftrace_graph_call - /* - * Calling ftrace_enable/disable_ftrace_graph_caller would overwrite the - * call below. Check ftrace_modify_all_code for details. - */ - call ftrace_stub - ld ra, 8(sp) - ld s0, 0(sp) - addi sp, sp, 16 - ret -ENDPROC(ftrace_graph_caller) + .macro RESTORE_ALL + REG_L x1, PT_RA(sp) + addi sp, sp, PT_SIZE_ON_STACK + REG_S x1, (sp) + addi sp, sp, -PT_SIZE_ON_STACK + REG_L x1, PT_EPC(sp) + REG_L x2, PT_SP(sp) + REG_L x3, PT_GP(sp) + REG_L x4, PT_TP(sp) + REG_L x5, PT_T0(sp) + REG_L x6, PT_T1(sp) + REG_L x7, PT_T2(sp) + REG_L x8, PT_S0(sp) + REG_L x9, PT_S1(sp) + REG_L x10, PT_A0(sp) + REG_L x11, PT_A1(sp) + REG_L x12, PT_A2(sp) + REG_L x13, PT_A3(sp) + REG_L x14, PT_A4(sp) + REG_L x15, PT_A5(sp) + REG_L x16, PT_A6(sp) + REG_L x17, PT_A7(sp) + REG_L x18, PT_S2(sp) + REG_L x19, PT_S3(sp) + REG_L x20, PT_S4(sp) + REG_L x21, PT_S5(sp) + REG_L x22, PT_S6(sp) + REG_L x23, PT_S7(sp) + REG_L x24, PT_S8(sp) + REG_L x25, PT_S9(sp) + REG_L x26, PT_S10(sp) + REG_L x27, PT_S11(sp) + REG_L x28, PT_T3(sp) + REG_L x29, PT_T4(sp) + REG_L x30, PT_T5(sp) + REG_L x31, PT_T6(sp) + + addi sp, sp, PT_SIZE_ON_STACK + addi sp, sp, SZREG + .endm +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ ENTRY(ftrace_caller) - /* - * a0: the address in the caller when calling ftrace_caller - * a1: the caller's return address - * a2: the address of global variable function_trace_op - */ - ld a1, -8(s0) - addi a0, ra, -MCOUNT_INSN_SIZE - la t5, function_trace_op - ld a2, 0(t5) + SAVE_ABI -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* - * the graph tracer (specifically, prepare_ftrace_return) needs these - * arguments but for now the function tracer occupies the regs, so we - * save them in temporary regs to recover later. - */ - addi t0, s0, -8 - mv t1, a0 -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - ld t2, -16(s0) -#endif -#endif + addi a0, ra, -FENTRY_RA_OFFSET + la a1, function_trace_op + REG_L a2, 0(a1) + REG_L a1, ABI_SIZE_ON_STACK(sp) + mv a3, sp - SAVE_ABI_STATE ftrace_call: .global ftrace_call - /* - * For the dynamic ftrace to work, here we should reserve at least - * 8 bytes for a functional auipc-jalr pair. The following call - * serves this purpose. - * - * Calling ftrace_update_ftrace_func would overwrite the nops below. - * Check ftrace_modify_all_code for details. - */ call ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER - RESTORE_GRAPH_ARGS - call ftrace_graph_caller + addi a0, sp, ABI_SIZE_ON_STACK + REG_L a1, ABI_RA(sp) + addi a1, a1, -FENTRY_RA_OFFSET +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + mv a2, s0 #endif - - RESTORE_ABI_STATE +ftrace_graph_call: + .global ftrace_graph_call + call ftrace_stub +#endif + RESTORE_ABI ret ENDPROC(ftrace_caller) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - .macro SAVE_ALL - addi sp, sp, -(PT_SIZE_ON_STACK+16) - sd s0, (PT_SIZE_ON_STACK)(sp) - sd ra, (PT_SIZE_ON_STACK+8)(sp) - addi s0, sp, (PT_SIZE_ON_STACK+16) - - sd x1, PT_RA(sp) - sd x2, PT_SP(sp) - sd x3, PT_GP(sp) - sd x4, PT_TP(sp) - sd x5, PT_T0(sp) - sd x6, PT_T1(sp) - sd x7, PT_T2(sp) - sd x8, PT_S0(sp) - sd x9, PT_S1(sp) - sd x10, PT_A0(sp) - sd x11, PT_A1(sp) - sd x12, PT_A2(sp) - sd x13, PT_A3(sp) - sd x14, PT_A4(sp) - sd x15, PT_A5(sp) - sd x16, PT_A6(sp) - sd x17, PT_A7(sp) - sd x18, PT_S2(sp) - sd x19, PT_S3(sp) - sd x20, PT_S4(sp) - sd x21, PT_S5(sp) - sd x22, PT_S6(sp) - sd x23, PT_S7(sp) - sd x24, PT_S8(sp) - sd x25, PT_S9(sp) - sd x26, PT_S10(sp) - sd x27, PT_S11(sp) - sd x28, PT_T3(sp) - sd x29, PT_T4(sp) - sd x30, PT_T5(sp) - sd x31, PT_T6(sp) - .endm - - .macro RESTORE_ALL - ld x1, PT_RA(sp) - ld x2, PT_SP(sp) - ld x3, PT_GP(sp) - ld x4, PT_TP(sp) - ld x5, PT_T0(sp) - ld x6, PT_T1(sp) - ld x7, PT_T2(sp) - ld x8, PT_S0(sp) - ld x9, PT_S1(sp) - ld x10, PT_A0(sp) - ld x11, PT_A1(sp) - ld x12, PT_A2(sp) - ld x13, PT_A3(sp) - ld x14, PT_A4(sp) - ld x15, PT_A5(sp) - ld x16, PT_A6(sp) - ld x17, PT_A7(sp) - ld x18, PT_S2(sp) - ld x19, PT_S3(sp) - ld x20, PT_S4(sp) - ld x21, PT_S5(sp) - ld x22, PT_S6(sp) - ld x23, PT_S7(sp) - ld x24, PT_S8(sp) - ld x25, PT_S9(sp) - ld x26, PT_S10(sp) - ld x27, PT_S11(sp) - ld x28, PT_T3(sp) - ld x29, PT_T4(sp) - ld x30, PT_T5(sp) - ld x31, PT_T6(sp) - - ld s0, (PT_SIZE_ON_STACK)(sp) - ld ra, (PT_SIZE_ON_STACK+8)(sp) - addi sp, sp, (PT_SIZE_ON_STACK+16) - .endm - - .macro RESTORE_GRAPH_REG_ARGS - ld a0, PT_T0(sp) - ld a1, PT_T1(sp) -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - ld a2, PT_T2(sp) -#endif - .endm - -/* - * Most of the contents are the same as ftrace_caller. - */ ENTRY(ftrace_regs_caller) - /* - * a3: the address of all registers in the stack - */ - ld a1, -8(s0) - addi a0, ra, -MCOUNT_INSN_SIZE - la t5, function_trace_op - ld a2, 0(t5) - addi a3, sp, -(PT_SIZE_ON_STACK+16) - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi t0, s0, -8 - mv t1, a0 -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - ld t2, -16(s0) -#endif -#endif SAVE_ALL + addi a0, ra, -FENTRY_RA_OFFSET + la a1, function_trace_op + REG_L a2, 0(a1) + REG_L a1, PT_SIZE_ON_STACK(sp) + mv a3, sp + ftrace_regs_call: .global ftrace_regs_call call ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER - RESTORE_GRAPH_REG_ARGS - call ftrace_graph_caller + addi a0, sp, PT_RA + REG_L a1, PT_EPC(sp) + addi a1, a1, -FENTRY_RA_OFFSET +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + mv a2, s0 +#endif +ftrace_graph_regs_call: + .global ftrace_graph_regs_call + call ftrace_stub #endif RESTORE_ALL From c22b0bcb1dd024cb9caad9230e3a387d8b061df5 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:42 +0000 Subject: [PATCH 034/506] riscv: Add kprobes supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables "kprobe & kretprobe" to work with ftrace interface. It utilized software breakpoint as single-step mechanism. Some instructions which can't be single-step executed must be simulated in kernel execution slot, such as: branch, jal, auipc, la ... Some instructions should be rejected for probing and we use a blacklist to filter, such as: ecall, ebreak, ... We use ebreak & c.ebreak to replace origin instruction and the kprobe handler prepares an executable memory slot for out-of-line execution with a copy of the original instruction being probed. In execution slot we add ebreak behind original instruction to simulate a single-setp mechanism. The patch is based on packi's work [1] and csky's work [2]. - The kprobes_trampoline.S is all from packi's patch - The single-step mechanism is new designed for riscv without hw single-step trap - The simulation codes are from csky - Frankly, all codes refer to other archs' implementation [1] https://lore.kernel.org/linux-riscv/20181113195804.22825-1-me@packi.ch/ [2] https://lore.kernel.org/linux-csky/20200403044150.20562-9-guoren@kernel.org/ Signed-off-by: Guo Ren Co-developed-by: Patrick Stählin Signed-off-by: Patrick Stählin Acked-by: Masami Hiramatsu Tested-by: Zong Li Reviewed-by: Pekka Enberg Cc: Patrick Stählin Cc: Palmer Dabbelt Cc: Björn Töpel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 + arch/riscv/include/asm/kprobes.h | 40 ++ arch/riscv/include/asm/probes.h | 24 ++ arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/probes/Makefile | 4 + arch/riscv/kernel/probes/decode-insn.c | 48 +++ arch/riscv/kernel/probes/decode-insn.h | 18 + arch/riscv/kernel/probes/kprobes.c | 398 ++++++++++++++++++ arch/riscv/kernel/probes/kprobes_trampoline.S | 93 ++++ arch/riscv/kernel/probes/simulate-insn.c | 85 ++++ arch/riscv/kernel/probes/simulate-insn.h | 47 +++ arch/riscv/kernel/traps.c | 9 + arch/riscv/mm/fault.c | 4 + 13 files changed, 773 insertions(+) create mode 100644 arch/riscv/include/asm/probes.h create mode 100644 arch/riscv/kernel/probes/Makefile create mode 100644 arch/riscv/kernel/probes/decode-insn.c create mode 100644 arch/riscv/kernel/probes/decode-insn.h create mode 100644 arch/riscv/kernel/probes/kprobes.c create mode 100644 arch/riscv/kernel/probes/kprobes_trampoline.S create mode 100644 arch/riscv/kernel/probes/simulate-insn.c create mode 100644 arch/riscv/kernel/probes/simulate-insn.h diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 1027384e2632..375f6f4d3038 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -71,6 +71,8 @@ config RISCV select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO if MMU && 64BIT select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_KPROBES + select HAVE_KRETPROBES select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_PERF_REGS diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h index 56a98ea30731..4647d38018f6 100644 --- a/arch/riscv/include/asm/kprobes.h +++ b/arch/riscv/include/asm/kprobes.h @@ -11,4 +11,44 @@ #include +#ifdef CONFIG_KPROBES +#include +#include +#include + +#define __ARCH_WANT_KPROBES_INSN_SLOT +#define MAX_INSN_SIZE 2 + +#define flush_insn_slot(p) do { } while (0) +#define kretprobe_blacklist_size 0 + +#include + +struct prev_kprobe { + struct kprobe *kp; + unsigned int status; +}; + +/* Single step context for kprobe */ +struct kprobe_step_ctx { + unsigned long ss_pending; + unsigned long match_addr; +}; + +/* per-cpu kprobe control block */ +struct kprobe_ctlblk { + unsigned int kprobe_status; + unsigned long saved_status; + struct prev_kprobe prev_kprobe; + struct kprobe_step_ctx ss_ctx; +}; + +void arch_remove_kprobe(struct kprobe *p); +int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); +bool kprobe_breakpoint_handler(struct pt_regs *regs); +bool kprobe_single_step_handler(struct pt_regs *regs); +void kretprobe_trampoline(void); +void __kprobes *trampoline_probe_handler(struct pt_regs *regs); + +#endif /* CONFIG_KPROBES */ #endif /* _ASM_RISCV_KPROBES_H */ diff --git a/arch/riscv/include/asm/probes.h b/arch/riscv/include/asm/probes.h new file mode 100644 index 000000000000..a787e6d537b9 --- /dev/null +++ b/arch/riscv/include/asm/probes.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_RISCV_PROBES_H +#define _ASM_RISCV_PROBES_H + +typedef u32 probe_opcode_t; +typedef bool (probes_handler_t) (u32 opcode, unsigned long addr, struct pt_regs *); + +/* architecture specific copy of original instruction */ +struct arch_probe_insn { + probe_opcode_t *insn; + probes_handler_t *handler; + /* restore address after simulation */ + unsigned long restore; +}; + +#ifdef CONFIG_KPROBES +typedef u32 kprobe_opcode_t; +struct arch_specific_insn { + struct arch_probe_insn api; +}; +#endif + +#endif /* _ASM_RISCV_PROBES_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 4eee315d7954..3dc0abde988a 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -30,6 +30,7 @@ obj-y += riscv_ksyms.o obj-y += stacktrace.o obj-y += cacheinfo.o obj-y += patch.o +obj-y += probes/ obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_M_MODE) += traps_misaligned.o diff --git a/arch/riscv/kernel/probes/Makefile b/arch/riscv/kernel/probes/Makefile new file mode 100644 index 000000000000..8a39507285a3 --- /dev/null +++ b/arch/riscv/kernel/probes/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o simulate-insn.o +obj-$(CONFIG_KPROBES) += kprobes_trampoline.o +CFLAGS_REMOVE_simulate-insn.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/kernel/probes/decode-insn.c b/arch/riscv/kernel/probes/decode-insn.c new file mode 100644 index 000000000000..0876c304ca77 --- /dev/null +++ b/arch/riscv/kernel/probes/decode-insn.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include + +#include "decode-insn.h" +#include "simulate-insn.h" + +/* Return: + * INSN_REJECTED If instruction is one not allowed to kprobe, + * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. + */ +enum probe_insn __kprobes +riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *api) +{ + probe_opcode_t insn = le32_to_cpu(*addr); + + /* + * Reject instructions list: + */ + RISCV_INSN_REJECTED(system, insn); + RISCV_INSN_REJECTED(fence, insn); + + /* + * Simulate instructions list: + * TODO: the REJECTED ones below need to be implemented + */ +#ifdef CONFIG_RISCV_ISA_C + RISCV_INSN_REJECTED(c_j, insn); + RISCV_INSN_REJECTED(c_jr, insn); + RISCV_INSN_REJECTED(c_jal, insn); + RISCV_INSN_REJECTED(c_jalr, insn); + RISCV_INSN_REJECTED(c_beqz, insn); + RISCV_INSN_REJECTED(c_bnez, insn); + RISCV_INSN_REJECTED(c_ebreak, insn); +#endif + + RISCV_INSN_REJECTED(auipc, insn); + RISCV_INSN_REJECTED(branch, insn); + + RISCV_INSN_SET_SIMULATE(jal, insn); + RISCV_INSN_SET_SIMULATE(jalr, insn); + + return INSN_GOOD; +} diff --git a/arch/riscv/kernel/probes/decode-insn.h b/arch/riscv/kernel/probes/decode-insn.h new file mode 100644 index 000000000000..42269a7d676d --- /dev/null +++ b/arch/riscv/kernel/probes/decode-insn.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _RISCV_KERNEL_KPROBES_DECODE_INSN_H +#define _RISCV_KERNEL_KPROBES_DECODE_INSN_H + +#include +#include + +enum probe_insn { + INSN_REJECTED, + INSN_GOOD_NO_SLOT, + INSN_GOOD, +}; + +enum probe_insn __kprobes +riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *asi); + +#endif /* _RISCV_KERNEL_KPROBES_DECODE_INSN_H */ diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c new file mode 100644 index 000000000000..e60893bd87db --- /dev/null +++ b/arch/riscv/kernel/probes/kprobes.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "decode-insn.h" + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +static void __kprobes +post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); + +static void __kprobes arch_prepare_ss_slot(struct kprobe *p) +{ + unsigned long offset = GET_INSN_LENGTH(p->opcode); + + p->ainsn.api.restore = (unsigned long)p->addr + offset; + + patch_text(p->ainsn.api.insn, p->opcode); + patch_text((void *)((unsigned long)(p->ainsn.api.insn) + offset), + __BUG_INSN_32); +} + +static void __kprobes arch_prepare_simulate(struct kprobe *p) +{ + p->ainsn.api.restore = 0; +} + +static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (p->ainsn.api.handler) + p->ainsn.api.handler((u32)p->opcode, + (unsigned long)p->addr, regs); + + post_kprobe_handler(kcb, regs); +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long probe_addr = (unsigned long)p->addr; + + if (probe_addr & 0x1) { + pr_warn("Address not aligned.\n"); + + return -EINVAL; + } + + /* copy instruction */ + p->opcode = le32_to_cpu(*p->addr); + + /* decode instruction */ + switch (riscv_probe_decode_insn(p->addr, &p->ainsn.api)) { + case INSN_REJECTED: /* insn not supported */ + return -EINVAL; + + case INSN_GOOD_NO_SLOT: /* insn need simulation */ + p->ainsn.api.insn = NULL; + break; + + case INSN_GOOD: /* instruction uses slot */ + p->ainsn.api.insn = get_insn_slot(); + if (!p->ainsn.api.insn) + return -ENOMEM; + break; + } + + /* prepare the instruction */ + if (p->ainsn.api.insn) + arch_prepare_ss_slot(p); + else + arch_prepare_simulate(p); + + return 0; +} + +/* install breakpoint in text */ +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + if ((p->opcode & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) + patch_text(p->addr, __BUG_INSN_32); + else + patch_text(p->addr, __BUG_INSN_16); +} + +/* remove breakpoint from text */ +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + patch_text(p->addr, p->opcode); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; +} + +static void __kprobes set_current_kprobe(struct kprobe *p) +{ + __this_cpu_write(current_kprobe, p); +} + +/* + * Interrupts need to be disabled before single-step mode is set, and not + * reenabled until after single-step mode ends. + * Without disabling interrupt on local CPU, there is a chance of + * interrupt occurrence in the period of exception return and start of + * out-of-line single-step, that result in wrongly single stepping + * into the interrupt handler. + */ +static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb, + struct pt_regs *regs) +{ + kcb->saved_status = regs->status; + regs->status &= ~SR_SPIE; +} + +static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, + struct pt_regs *regs) +{ + regs->status = kcb->saved_status; +} + +static void __kprobes +set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr, struct kprobe *p) +{ + unsigned long offset = GET_INSN_LENGTH(p->opcode); + + kcb->ss_ctx.ss_pending = true; + kcb->ss_ctx.match_addr = addr + offset; +} + +static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb) +{ + kcb->ss_ctx.ss_pending = false; + kcb->ss_ctx.match_addr = 0; +} + +static void __kprobes setup_singlestep(struct kprobe *p, + struct pt_regs *regs, + struct kprobe_ctlblk *kcb, int reenter) +{ + unsigned long slot; + + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_REENTER; + } else { + kcb->kprobe_status = KPROBE_HIT_SS; + } + + if (p->ainsn.api.insn) { + /* prepare for single stepping */ + slot = (unsigned long)p->ainsn.api.insn; + + set_ss_context(kcb, slot, p); /* mark pending ss */ + + /* IRQs and single stepping do not mix well. */ + kprobes_save_local_irqflag(kcb, regs); + + instruction_pointer_set(regs, slot); + } else { + /* insn simulation */ + arch_simulate_insn(p, regs); + } +} + +static int __kprobes reenter_kprobe(struct kprobe *p, + struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + kprobes_inc_nmissed_count(p); + setup_singlestep(p, regs, kcb, 1); + break; + case KPROBE_HIT_SS: + case KPROBE_REENTER: + pr_warn("Unrecoverable kprobe detected.\n"); + dump_kprobe(p); + BUG(); + break; + default: + WARN_ON(1); + return 0; + } + + return 1; +} + +static void __kprobes +post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + + if (!cur) + return; + + /* return addr restore if non-branching insn */ + if (cur->ainsn.api.restore != 0) + regs->epc = cur->ainsn.api.restore; + + /* restore back original saved kprobe variables and continue */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + return; + } + + /* call post handler */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) { + /* post_handler can hit breakpoint and single step + * again, so we enable D-flag for recursive exception. + */ + cur->post_handler(cur, regs, 0); + } + + reset_current_kprobe(); +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + switch (kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->epc = (unsigned long) cur->addr; + if (!instruction_pointer(regs)) + BUG(); + + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + } + return 0; +} + +bool __kprobes +kprobe_breakpoint_handler(struct pt_regs *regs) +{ + struct kprobe *p, *cur_kprobe; + struct kprobe_ctlblk *kcb; + unsigned long addr = instruction_pointer(regs); + + kcb = get_kprobe_ctlblk(); + cur_kprobe = kprobe_running(); + + p = get_kprobe((kprobe_opcode_t *) addr); + + if (p) { + if (cur_kprobe) { + if (reenter_kprobe(p, regs, kcb)) + return true; + } else { + /* Probe hit */ + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it will + * modify the execution path and no need to single + * stepping. Let's just reset current kprobe and exit. + * + * pre_handler can hit a breakpoint and can step thru + * before return. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); + } + return true; + } + + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Return back to original instruction, and continue. + */ + return false; +} + +bool __kprobes +kprobe_single_step_handler(struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if ((kcb->ss_ctx.ss_pending) + && (kcb->ss_ctx.match_addr == instruction_pointer(regs))) { + clear_ss_context(kcb); /* clear pending ss */ + + kprobes_restore_local_irqflag(kcb, regs); + + post_kprobe_handler(kcb, regs); + return true; + } + return false; +} + +/* + * Provide a blacklist of symbols identifying ranges which cannot be kprobed. + * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). + */ +int __init arch_populate_kprobe_blacklist(void) +{ + int ret; + + ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); + return ret; +} + +void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) +{ + return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); +} + +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + ri->ret_addr = (kprobe_opcode_t *)regs->ra; + ri->fp = NULL; + regs->ra = (unsigned long) &kretprobe_trampoline; +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} + +int __init arch_init_kprobes(void) +{ + return 0; +} diff --git a/arch/riscv/kernel/probes/kprobes_trampoline.S b/arch/riscv/kernel/probes/kprobes_trampoline.S new file mode 100644 index 000000000000..6e85d021e2a2 --- /dev/null +++ b/arch/riscv/kernel/probes/kprobes_trampoline.S @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Author: Patrick Stählin + */ +#include + +#include +#include + + .text + .altmacro + + .macro save_all_base_regs + REG_S x1, PT_RA(sp) + REG_S x3, PT_GP(sp) + REG_S x4, PT_TP(sp) + REG_S x5, PT_T0(sp) + REG_S x6, PT_T1(sp) + REG_S x7, PT_T2(sp) + REG_S x8, PT_S0(sp) + REG_S x9, PT_S1(sp) + REG_S x10, PT_A0(sp) + REG_S x11, PT_A1(sp) + REG_S x12, PT_A2(sp) + REG_S x13, PT_A3(sp) + REG_S x14, PT_A4(sp) + REG_S x15, PT_A5(sp) + REG_S x16, PT_A6(sp) + REG_S x17, PT_A7(sp) + REG_S x18, PT_S2(sp) + REG_S x19, PT_S3(sp) + REG_S x20, PT_S4(sp) + REG_S x21, PT_S5(sp) + REG_S x22, PT_S6(sp) + REG_S x23, PT_S7(sp) + REG_S x24, PT_S8(sp) + REG_S x25, PT_S9(sp) + REG_S x26, PT_S10(sp) + REG_S x27, PT_S11(sp) + REG_S x28, PT_T3(sp) + REG_S x29, PT_T4(sp) + REG_S x30, PT_T5(sp) + REG_S x31, PT_T6(sp) + .endm + + .macro restore_all_base_regs + REG_L x3, PT_GP(sp) + REG_L x4, PT_TP(sp) + REG_L x5, PT_T0(sp) + REG_L x6, PT_T1(sp) + REG_L x7, PT_T2(sp) + REG_L x8, PT_S0(sp) + REG_L x9, PT_S1(sp) + REG_L x10, PT_A0(sp) + REG_L x11, PT_A1(sp) + REG_L x12, PT_A2(sp) + REG_L x13, PT_A3(sp) + REG_L x14, PT_A4(sp) + REG_L x15, PT_A5(sp) + REG_L x16, PT_A6(sp) + REG_L x17, PT_A7(sp) + REG_L x18, PT_S2(sp) + REG_L x19, PT_S3(sp) + REG_L x20, PT_S4(sp) + REG_L x21, PT_S5(sp) + REG_L x22, PT_S6(sp) + REG_L x23, PT_S7(sp) + REG_L x24, PT_S8(sp) + REG_L x25, PT_S9(sp) + REG_L x26, PT_S10(sp) + REG_L x27, PT_S11(sp) + REG_L x28, PT_T3(sp) + REG_L x29, PT_T4(sp) + REG_L x30, PT_T5(sp) + REG_L x31, PT_T6(sp) + .endm + +ENTRY(kretprobe_trampoline) + addi sp, sp, -(PT_SIZE_ON_STACK) + save_all_base_regs + + move a0, sp /* pt_regs */ + + call trampoline_probe_handler + + /* use the result as the return-address */ + move ra, a0 + + restore_all_base_regs + addi sp, sp, PT_SIZE_ON_STACK + + ret +ENDPROC(kretprobe_trampoline) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c new file mode 100644 index 000000000000..2519ce26377d --- /dev/null +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include + +#include "decode-insn.h" +#include "simulate-insn.h" + +static inline bool rv_insn_reg_get_val(struct pt_regs *regs, u32 index, + unsigned long *ptr) +{ + if (index == 0) + *ptr = 0; + else if (index <= 31) + *ptr = *((unsigned long *)regs + index); + else + return false; + + return true; +} + +static inline bool rv_insn_reg_set_val(struct pt_regs *regs, u32 index, + unsigned long val) +{ + if (index == 0) + return false; + else if (index <= 31) + *((unsigned long *)regs + index) = val; + else + return false; + + return true; +} + +bool __kprobes simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + /* + * 31 30 21 20 19 12 11 7 6 0 + * imm [20] | imm[10:1] | imm[11] | imm[19:12] | rd | opcode + * 1 10 1 8 5 JAL/J + */ + bool ret; + u32 imm; + u32 index = (opcode >> 7) & 0x1f; + + ret = rv_insn_reg_set_val(regs, index, addr + 4); + if (!ret) + return ret; + + imm = ((opcode >> 21) & 0x3ff) << 1; + imm |= ((opcode >> 20) & 0x1) << 11; + imm |= ((opcode >> 12) & 0xff) << 12; + imm |= ((opcode >> 31) & 0x1) << 20; + + instruction_pointer_set(regs, addr + sign_extend32((imm), 20)); + + return ret; +} + +bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + /* + * 31 20 19 15 14 12 11 7 6 0 + * offset[11:0] | rs1 | 010 | rd | opcode + * 12 5 3 5 JALR/JR + */ + bool ret; + unsigned long base_addr; + u32 imm = (opcode >> 20) & 0xfff; + u32 rd_index = (opcode >> 7) & 0x1f; + u32 rs1_index = (opcode >> 15) & 0x1f; + + ret = rv_insn_reg_set_val(regs, rd_index, addr + 4); + if (!ret) + return ret; + + ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); + if (!ret) + return ret; + + instruction_pointer_set(regs, (base_addr + sign_extend32((imm), 11))&~1); + + return ret; +} diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h new file mode 100644 index 000000000000..cb6ff7dccb92 --- /dev/null +++ b/arch/riscv/kernel/probes/simulate-insn.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _RISCV_KERNEL_PROBES_SIMULATE_INSN_H +#define _RISCV_KERNEL_PROBES_SIMULATE_INSN_H + +#define __RISCV_INSN_FUNCS(name, mask, val) \ +static __always_inline bool riscv_insn_is_##name(probe_opcode_t code) \ +{ \ + BUILD_BUG_ON(~(mask) & (val)); \ + return (code & (mask)) == (val); \ +} \ +bool simulate_##name(u32 opcode, unsigned long addr, \ + struct pt_regs *regs) + +#define RISCV_INSN_REJECTED(name, code) \ + do { \ + if (riscv_insn_is_##name(code)) { \ + return INSN_REJECTED; \ + } \ + } while (0) + +__RISCV_INSN_FUNCS(system, 0x7f, 0x73); +__RISCV_INSN_FUNCS(fence, 0x7f, 0x0f); + +#define RISCV_INSN_SET_SIMULATE(name, code) \ + do { \ + if (riscv_insn_is_##name(code)) { \ + api->handler = simulate_##name; \ + return INSN_GOOD_NO_SLOT; \ + } \ + } while (0) + +__RISCV_INSN_FUNCS(c_j, 0xe003, 0xa001); +__RISCV_INSN_FUNCS(c_jr, 0xf007, 0x8002); +__RISCV_INSN_FUNCS(c_jal, 0xe003, 0x2001); +__RISCV_INSN_FUNCS(c_jalr, 0xf007, 0x9002); +__RISCV_INSN_FUNCS(c_beqz, 0xe003, 0xc001); +__RISCV_INSN_FUNCS(c_bnez, 0xe003, 0xe001); +__RISCV_INSN_FUNCS(c_ebreak, 0xffff, 0x9002); + +__RISCV_INSN_FUNCS(auipc, 0x7f, 0x17); +__RISCV_INSN_FUNCS(branch, 0x7f, 0x63); + +__RISCV_INSN_FUNCS(jal, 0x7f, 0x6f); +__RISCV_INSN_FUNCS(jalr, 0x707f, 0x67); + +#endif /* _RISCV_KERNEL_PROBES_SIMULATE_INSN_H */ diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index ad14f4466d92..19a788a271f7 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -145,6 +146,14 @@ static inline unsigned long get_break_insn_length(unsigned long pc) asmlinkage __visible void do_trap_break(struct pt_regs *regs) { +#ifdef CONFIG_KPROBES + if (kprobe_single_step_handler(regs)) + return; + + if (kprobe_breakpoint_handler(regs)) + return; +#endif + if (user_mode(regs)) force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->epc); #ifdef CONFIG_KGDB diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 33d284188f9a..1da870f33582 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -213,6 +214,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs) tsk = current; mm = tsk->mm; + if (kprobe_page_fault(regs, cause)) + return; + /* * Fault-in kernel-space virtual memory on-demand. * The 'reference' page table is init_mm.pgd. From 829adda597fef2fe133d57aac681580b2e292268 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:43 +0000 Subject: [PATCH 035/506] riscv: Add KPROBES_ON_FTRACE supported This patch adds support for kprobes on ftrace call sites to avoids much of the overhead with regular kprobes. Try it with simple steps: echo 'p:myprobe sys_clone a0=%a0 a1=%a1 stack_val=+4($stack)' > /sys/kernel/de bug/tracing/kprobe_events echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable cat /sys/kernel/debug/tracing/trace tracer: nop entries-in-buffer/entries-written: 1/1 #P:1 _-----=> irqs-off / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / delay TASK-PID CPU# |||| TIMESTAMP FUNCTION | | | |||| | | sh-92 [000] .... 369.899962: myprobe: (sys_clone+0x0/0x28) a0=0x1200011 a1=0x0 stack_val=0x201c20ffffffe0 cat /sys/kernel/debug/kprobes/list ffffffe00020b584 k sys_clone+0x0 [FTRACE] ^^^^^^ Signed-off-by: Guo Ren Reviewed-by: Masami Hiramatsu Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/kernel/probes/Makefile | 1 + arch/riscv/kernel/probes/ftrace.c | 53 +++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 arch/riscv/kernel/probes/ftrace.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 375f6f4d3038..07a48068567e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -72,6 +72,7 @@ config RISCV select HAVE_GENERIC_VDSO if MMU && 64BIT select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES select HAVE_PCI select HAVE_PERF_EVENTS diff --git a/arch/riscv/kernel/probes/Makefile b/arch/riscv/kernel/probes/Makefile index 8a39507285a3..abbd1314e2fb 100644 --- a/arch/riscv/kernel/probes/Makefile +++ b/arch/riscv/kernel/probes/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o simulate-insn.o obj-$(CONFIG_KPROBES) += kprobes_trampoline.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o CFLAGS_REMOVE_simulate-insn.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c new file mode 100644 index 000000000000..e6372490aa0b --- /dev/null +++ b/arch/riscv/kernel/probes/ftrace.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + return; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + unsigned long orig_ip = instruction_pointer(&(regs->regs)); + + instruction_pointer_set(&(regs->regs), ip); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, &(regs->regs))) { + /* + * Emulate singlestep (and also recover regs->pc) + * as if there is a nop + */ + instruction_pointer_set(&(regs->regs), + (unsigned long)p->addr + MCOUNT_INSN_SIZE); + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, &(regs->regs), 0); + } + instruction_pointer_set(&(regs->regs), orig_ip); + } + + /* + * If pre_handler returns !0, it changes regs->pc. We have to + * skip emulating post_handler. + */ + __this_cpu_write(current_kprobe, NULL); + } +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.api.insn = NULL; + return 0; +} From 74784081aac8a0f3636965fc230e2d3b7cc123c6 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:44 +0000 Subject: [PATCH 036/506] riscv: Add uprobes supported This patch adds support for uprobes on riscv architecture. Just like kprobe, it support single-step and simulate instructions. Signed-off-by: Guo Ren Reviewed-by: Pekka Enberg Cc: Oleg Nesterov Cc: Masami Hiramatsu Cc: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 3 + arch/riscv/include/asm/processor.h | 1 + arch/riscv/include/asm/thread_info.h | 4 +- arch/riscv/include/asm/uprobes.h | 40 ++++++ arch/riscv/kernel/probes/Makefile | 1 + arch/riscv/kernel/probes/uprobes.c | 186 +++++++++++++++++++++++++++ arch/riscv/kernel/signal.c | 3 + arch/riscv/kernel/traps.c | 10 ++ arch/riscv/mm/fault.c | 6 + 9 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/uprobes.h create mode 100644 arch/riscv/kernel/probes/uprobes.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 07a48068567e..7b5bb48055e7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -160,6 +160,9 @@ config ARCH_SELECT_MEMORY_MODEL config ARCH_WANT_GENERAL_HUGETLB def_bool y +config ARCH_SUPPORTS_UPROBES + def_bool y + config SYS_SUPPORTS_HUGETLBFS depends on MMU def_bool y diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index bdddcd5c1b71..3a240037bde2 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -34,6 +34,7 @@ struct thread_struct { unsigned long sp; /* Kernel mode stack */ unsigned long s[12]; /* s[0]: frame pointer */ struct __riscv_d_ext_state fstate; + unsigned long bad_cause; }; #define INIT_THREAD { \ diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 97bf5a1575d2..0e549a3089b3 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -75,6 +75,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing */ #define TIF_SECCOMP 8 /* syscall secure computing */ #define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ +#define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -84,10 +85,11 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_WORK_MASK \ (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \ - _TIF_NOTIFY_SIGNAL) + _TIF_NOTIFY_SIGNAL | _TIF_UPROBE) #define _TIF_SYSCALL_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \ diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h new file mode 100644 index 000000000000..f2183e00fdd2 --- /dev/null +++ b/arch/riscv/include/asm/uprobes.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ASM_RISCV_UPROBES_H +#define _ASM_RISCV_UPROBES_H + +#include +#include +#include + +#define MAX_UINSN_BYTES 8 + +#ifdef CONFIG_RISCV_ISA_C +#define UPROBE_SWBP_INSN __BUG_INSN_16 +#define UPROBE_SWBP_INSN_SIZE 2 +#else +#define UPROBE_SWBP_INSN __BUG_INSN_32 +#define UPROBE_SWBP_INSN_SIZE 4 +#endif +#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES + +typedef u32 uprobe_opcode_t; + +struct arch_uprobe_task { + unsigned long saved_cause; +}; + +struct arch_uprobe { + union { + u8 insn[MAX_UINSN_BYTES]; + u8 ixol[MAX_UINSN_BYTES]; + }; + struct arch_probe_insn api; + unsigned long insn_size; + bool simulate; +}; + +bool uprobe_breakpoint_handler(struct pt_regs *regs); +bool uprobe_single_step_handler(struct pt_regs *regs); + +#endif /* _ASM_RISCV_UPROBES_H */ diff --git a/arch/riscv/kernel/probes/Makefile b/arch/riscv/kernel/probes/Makefile index abbd1314e2fb..7f0840dcc31b 100644 --- a/arch/riscv/kernel/probes/Makefile +++ b/arch/riscv/kernel/probes/Makefile @@ -2,4 +2,5 @@ obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o simulate-insn.o obj-$(CONFIG_KPROBES) += kprobes_trampoline.o obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o +obj-$(CONFIG_UPROBES) += uprobes.o decode-insn.o simulate-insn.o CFLAGS_REMOVE_simulate-insn.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c new file mode 100644 index 000000000000..7a057b5f0adc --- /dev/null +++ b/arch/riscv/kernel/probes/uprobes.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#include "decode-insn.h" + +#define UPROBE_TRAP_NR UINT_MAX + +bool is_swbp_insn(uprobe_opcode_t *insn) +{ +#ifdef CONFIG_RISCV_ISA_C + return (*insn & 0xffff) == UPROBE_SWBP_INSN; +#else + return *insn == UPROBE_SWBP_INSN; +#endif +} + +unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long addr) +{ + probe_opcode_t opcode; + + opcode = *(probe_opcode_t *)(&auprobe->insn[0]); + + auprobe->insn_size = GET_INSN_LENGTH(opcode); + + switch (riscv_probe_decode_insn(&opcode, &auprobe->api)) { + case INSN_REJECTED: + return -EINVAL; + + case INSN_GOOD_NO_SLOT: + auprobe->simulate = true; + break; + + case INSN_GOOD: + auprobe->simulate = false; + break; + + default: + return -EINVAL; + } + + return 0; +} + +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + utask->autask.saved_cause = current->thread.bad_cause; + current->thread.bad_cause = UPROBE_TRAP_NR; + + instruction_pointer_set(regs, utask->xol_vaddr); + + regs->status &= ~SR_SPIE; + + return 0; +} + +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + WARN_ON_ONCE(current->thread.bad_cause != UPROBE_TRAP_NR); + + instruction_pointer_set(regs, utask->vaddr + auprobe->insn_size); + + regs->status |= SR_SPIE; + + return 0; +} + +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + if (t->thread.bad_cause != UPROBE_TRAP_NR) + return true; + + return false; +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + probe_opcode_t insn; + unsigned long addr; + + if (!auprobe->simulate) + return false; + + insn = *(probe_opcode_t *)(&auprobe->insn[0]); + addr = instruction_pointer(regs); + + if (auprobe->api.handler) + auprobe->api.handler(insn, addr, regs); + + return true; +} + +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + /* + * Task has received a fatal signal, so reset back to probbed + * address. + */ + instruction_pointer_set(regs, utask->vaddr); + + regs->status &= ~SR_SPIE; +} + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CHAIN_CALL) + return regs->sp <= ret->stack; + else + return regs->sp < ret->stack; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, + struct pt_regs *regs) +{ + unsigned long ra; + + ra = regs->ra; + + regs->ra = trampoline_vaddr; + + return ra; +} + +int arch_uprobe_exception_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + +bool uprobe_breakpoint_handler(struct pt_regs *regs) +{ + if (uprobe_pre_sstep_notifier(regs)) + return true; + + return false; +} + +bool uprobe_single_step_handler(struct pt_regs *regs) +{ + if (uprobe_post_sstep_notifier(regs)) + return true; + + return false; +} + +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + /* Initialize the slot */ + void *kaddr = kmap_atomic(page); + void *dst = kaddr + (vaddr & ~PAGE_MASK); + + memcpy(dst, src, len); + + /* Add ebreak behind opcode to simulate singlestep */ + if (vaddr) { + dst += GET_INSN_LENGTH(*(probe_opcode_t *)src); + *(uprobe_opcode_t *)dst = __BUG_INSN_32; + } + + kunmap_atomic(kaddr); + + /* + * We probably need flush_icache_user_page() but it needs vma. + * This should work on most of architectures by default. If + * architecture needs to do something different it can define + * its own version of the function. + */ + flush_dcache_page(page); +} diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 469aef8ed922..65942b3748b4 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -309,6 +309,9 @@ static void do_signal(struct pt_regs *regs) asmlinkage __visible void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) { + if (thread_info_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + /* Handle pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 19a788a271f7..2bca2fab2d27 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -76,6 +76,8 @@ void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) static void do_trap_error(struct pt_regs *regs, int signo, int code, unsigned long addr, const char *str) { + current->thread.bad_cause = regs->cause; + if (user_mode(regs)) { do_trap(regs, signo, code, addr); } else { @@ -153,6 +155,14 @@ asmlinkage __visible void do_trap_break(struct pt_regs *regs) if (kprobe_breakpoint_handler(regs)) return; #endif +#ifdef CONFIG_UPROBES + if (uprobe_single_step_handler(regs)) + return; + + if (uprobe_breakpoint_handler(regs)) + return; +#endif + current->thread.bad_cause = regs->cause; if (user_mode(regs)) force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->epc); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 1da870f33582..8f17519208c7 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -240,6 +240,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * in an atomic region, then we must not take the fault. */ if (unlikely(faulthandler_disabled() || !mm)) { + tsk->thread.bad_cause = cause; no_context(regs, addr); return; } @@ -262,16 +263,19 @@ asmlinkage void do_page_fault(struct pt_regs *regs) mmap_read_lock(mm); vma = find_vma(mm, addr); if (unlikely(!vma)) { + tsk->thread.bad_cause = cause; bad_area(regs, mm, code, addr); return; } if (likely(vma->vm_start <= addr)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + tsk->thread.bad_cause = cause; bad_area(regs, mm, code, addr); return; } if (unlikely(expand_stack(vma, addr))) { + tsk->thread.bad_cause = cause; bad_area(regs, mm, code, addr); return; } @@ -284,6 +288,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) code = SEGV_ACCERR; if (unlikely(access_error(cause, vma))) { + tsk->thread.bad_cause = cause; bad_area(regs, mm, code, addr); return; } @@ -317,6 +322,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) mmap_read_unlock(mm); if (unlikely(fault & VM_FAULT_ERROR)) { + tsk->thread.bad_cause = cause; mm_fault_error(regs, addr, fault); return; } From ee55ff803b383e03d0855661d3416aa1763e54f9 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:01:45 +0000 Subject: [PATCH 037/506] riscv: Add support for function error injection Inspired by the commit 42d038c4fb00 ("arm64: Add support for function error injection"), this patch supports function error injection for riscv. This patch mainly support two functions: one is regs_set_return_value() which is used to overwrite the return value; the another function is override_function_with_return() which is to override the probed function returning and jump to its caller. Test log: cd /sys/kernel/debug/fail_function echo sys_clone > inject echo 100 > probability echo 1 > interval ls / [ 313.176875] FAULT_INJECTION: forcing a failure. [ 313.176875] name fail_function, interval 1, probability 100, space 0, times 1 [ 313.184357] CPU: 0 PID: 87 Comm: sh Not tainted 5.8.0-rc5-00007-g6a758cc #117 [ 313.187616] Call Trace: [ 313.189100] [] walk_stackframe+0x0/0xc2 [ 313.191626] [] show_stack+0x40/0x4c [ 313.193927] [] dump_stack+0x7c/0x96 [ 313.194795] [] should_fail+0x140/0x142 [ 313.195923] [] fei_kprobe_handler+0x2c/0x5a [ 313.197687] [] kprobe_breakpoint_handler+0xb4/0x18a [ 313.200054] [] do_trap_break+0x36/0xca [ 313.202147] [] ret_from_exception+0x0/0xc [ 313.204556] [] ret_from_syscall+0x0/0x2 -sh: can't fork: Invalid argument Signed-off-by: Guo Ren Reviewed-by: Masami Hiramatsu Cc: Palmer Dabbelt Cc: Paul Walmsley Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/ptrace.h | 6 ++++++ arch/riscv/lib/Makefile | 2 ++ arch/riscv/lib/error-inject.c | 10 ++++++++++ 4 files changed, 19 insertions(+) create mode 100644 arch/riscv/lib/error-inject.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7b5bb48055e7..18d293f84318 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -67,6 +67,7 @@ config RISCV select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU select HAVE_EBPF_JIT if MMU + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO if MMU && 64BIT diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 23372bbcfde8..cb4abb639e8d 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -109,6 +109,12 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->a0; } +static inline void regs_set_return_value(struct pt_regs *regs, + unsigned long val) +{ + regs->a0 = val; +} + extern int regs_query_register_offset(const char *name); extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index ac6171e9c19e..25d5c9664e57 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -5,3 +5,5 @@ lib-y += memset.o lib-y += memmove.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o + +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/riscv/lib/error-inject.c b/arch/riscv/lib/error-inject.c new file mode 100644 index 000000000000..d667ade2bc41 --- /dev/null +++ b/arch/riscv/lib/error-inject.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +void override_function_with_return(struct pt_regs *regs) +{ + instruction_pointer_set(regs, regs->ra); +} +NOKPROBE_SYMBOL(override_function_with_return); From fea2fed201ee5647699018a56fbb6a5e8cc053a5 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 17 Dec 2020 16:29:18 +0000 Subject: [PATCH 038/506] riscv: Enable per-task stack canaries This enables the use of per-task stack canary values if GCC has support for emitting the stack canary reference relative to the value of tp, which holds the task struct pointer in the riscv kernel. After compare arm64 and x86 implementations, seems arm64's is more flexible and readable. The key point is how gcc get the offset of stack_canary from gs/el0_sp. x86: Use a fix offset from gs, not flexible. struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. */ char gs_base[40]; // :( unsigned long stack_canary; }; arm64: Use -mstack-protector-guard-offset & guard-reg gcc options: -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=xxx riscv: Use -mstack-protector-guard-offset & guard-reg gcc options: -mstack-protector-guard=tls -mstack-protector-guard-reg=tp -mstack-protector-guard-offset=xxx GCC's implementation has been merged: commit c931e8d5a96463427040b0d11f9c4352ac22b2b0 Author: Cooper Qu Date: Mon Jul 13 16:15:08 2020 +0800 RISC-V: Add support for TLS stack protector canary access In the end, these codes are inserted by gcc before return: * 0xffffffe00020b396 <+120>: ld a5,1008(tp) # 0x3f0 * 0xffffffe00020b39a <+124>: xor a5,a5,a4 * 0xffffffe00020b39c <+126>: mv a0,s5 * 0xffffffe00020b39e <+128>: bnez a5,0xffffffe00020b61c <_do_fork+766> 0xffffffe00020b3a2 <+132>: ld ra,136(sp) 0xffffffe00020b3a4 <+134>: ld s0,128(sp) 0xffffffe00020b3a6 <+136>: ld s1,120(sp) 0xffffffe00020b3a8 <+138>: ld s2,112(sp) 0xffffffe00020b3aa <+140>: ld s3,104(sp) 0xffffffe00020b3ac <+142>: ld s4,96(sp) 0xffffffe00020b3ae <+144>: ld s5,88(sp) 0xffffffe00020b3b0 <+146>: ld s6,80(sp) 0xffffffe00020b3b2 <+148>: ld s7,72(sp) 0xffffffe00020b3b4 <+150>: addi sp,sp,144 0xffffffe00020b3b6 <+152>: ret ... * 0xffffffe00020b61c <+766>: auipc ra,0x7f8 * 0xffffffe00020b620 <+770>: jalr -1764(ra) # 0xffffffe000a02f38 <__stack_chk_fail> Signed-off-by: Guo Ren Signed-off-by: Cooper Qu Reviewed-by: Kees Cook Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 7 +++++++ arch/riscv/Makefile | 10 ++++++++++ arch/riscv/include/asm/stackprotector.h | 3 ++- arch/riscv/kernel/asm-offsets.c | 3 +++ arch/riscv/kernel/process.c | 2 +- 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 18d293f84318..c8d45da8edd9 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -449,6 +449,13 @@ config EFI allow the kernel to be booted as an EFI application. This is only useful on systems that have UEFI firmware. +config CC_HAVE_STACKPROTECTOR_TLS + def_bool $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=tp -mstack-protector-guard-offset=0) + +config STACKPROTECTOR_PER_TASK + def_bool y + depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS + endmenu config BUILTIN_DTB diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 2372464ad27d..1368d943f1f3 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -67,6 +67,16 @@ KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-relax) # architectures. It's faster to have GCC emit only aligned accesses. KBUILD_CFLAGS += $(call cc-option,-mstrict-align) +ifeq ($(CONFIG_STACKPROTECTOR_PER_TASK),y) +prepare: stack_protector_prepare +stack_protector_prepare: prepare0 + $(eval KBUILD_CFLAGS += -mstack-protector-guard=tls \ + -mstack-protector-guard-reg=tp \ + -mstack-protector-guard-offset=$(shell \ + awk '{if ($$2 == "TSK_STACK_CANARY") print $$3;}' \ + include/generated/asm-offsets.h)) +endif + # arch specific predefines for sparse CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS) diff --git a/arch/riscv/include/asm/stackprotector.h b/arch/riscv/include/asm/stackprotector.h index 5962f8891f06..09093af46565 100644 --- a/arch/riscv/include/asm/stackprotector.h +++ b/arch/riscv/include/asm/stackprotector.h @@ -24,6 +24,7 @@ static __always_inline void boot_init_stack_canary(void) canary &= CANARY_MASK; current->stack_canary = canary; - __stack_chk_guard = current->stack_canary; + if (!IS_ENABLED(CONFIG_STACKPROTECTOR_PER_TASK)) + __stack_chk_guard = current->stack_canary; } #endif /* _ASM_RISCV_STACKPROTECTOR_H */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index b79ffa3561fd..9ef33346853c 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -68,6 +68,9 @@ void asm_offsets(void) OFFSET(TASK_THREAD_F30, task_struct, thread.fstate.f[30]); OFFSET(TASK_THREAD_F31, task_struct, thread.fstate.f[31]); OFFSET(TASK_THREAD_FCSR, task_struct, thread.fstate.fcsr); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(TSK_STACK_CANARY, task_struct, stack_canary); +#endif DEFINE(PT_SIZE, sizeof(struct pt_regs)); OFFSET(PT_EPC, pt_regs, epc); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index dd5f985b1f40..f786b5fc6bcb 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -24,7 +24,7 @@ register unsigned long gp_in_global __asm__("gp"); -#ifdef CONFIG_STACKPROTECTOR +#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include unsigned long __stack_chk_guard __read_mostly; EXPORT_SYMBOL(__stack_chk_guard); From 091b9450858ecd60eeb5a14231db07071d24875f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 11 Jan 2021 20:40:12 +0800 Subject: [PATCH 039/506] riscv: Add dump stack in show_regs Like commit 1149aad10b1e ("arm64: Add dump_backtrace() in show_regs"), dump the stack in riscv show_regs as common code expects. Reviewed-by: Atish Patra Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/bug.h | 1 + arch/riscv/include/asm/stacktrace.h | 2 ++ arch/riscv/kernel/process.c | 9 ++++++++- arch/riscv/kernel/stacktrace.c | 10 ++++++++-- arch/riscv/kernel/traps.c | 3 ++- 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index d6f1ec08d97b..d3804a2f9aad 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -85,6 +85,7 @@ do { \ struct pt_regs; struct task_struct; +void __show_regs(struct pt_regs *regs); void die(struct pt_regs *regs, const char *str); void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr); diff --git a/arch/riscv/include/asm/stacktrace.h b/arch/riscv/include/asm/stacktrace.h index 470a65c4ccdc..3450c1912afd 100644 --- a/arch/riscv/include/asm/stacktrace.h +++ b/arch/riscv/include/asm/stacktrace.h @@ -13,5 +13,7 @@ struct stackframe { extern void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, bool (*fn)(void *, unsigned long), void *arg); +extern void dump_backtrace(struct pt_regs *regs, struct task_struct *task, + const char *loglvl); #endif /* _ASM_RISCV_STACKTRACE_H */ diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index f786b5fc6bcb..ced48d9f4254 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,7 @@ void arch_cpu_idle(void) raw_local_irq_enable(); } -void show_regs(struct pt_regs *regs) +void __show_regs(struct pt_regs *regs) { show_regs_print_info(KERN_DEFAULT); @@ -69,6 +70,12 @@ void show_regs(struct pt_regs *regs) pr_cont("status: " REG_FMT " badaddr: " REG_FMT " cause: " REG_FMT "\n", regs->status, regs->badaddr, regs->cause); } +void show_regs(struct pt_regs *regs) +{ + __show_regs(regs); + if (!user_mode(regs)) + dump_backtrace(regs, NULL, KERN_DEFAULT); +} void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 48b870a685b3..76dadf6d396b 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -101,10 +101,16 @@ static bool print_trace_address(void *arg, unsigned long pc) return true; } +void dump_backtrace(struct pt_regs *regs, struct task_struct *task, + const char *loglvl) +{ + pr_cont("%sCall Trace:\n", loglvl); + walk_stackframe(task, regs, print_trace_address, (void *)loglvl); +} + void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { - pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); + dump_backtrace(NULL, task, loglvl); } static bool save_wchan(void *arg, unsigned long pc) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 2bca2fab2d27..3ed2c23601a0 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -67,7 +68,7 @@ void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) tsk->comm, task_pid_nr(tsk), signo, code, addr); print_vma_addr(KERN_CONT " in ", instruction_pointer(regs)); pr_cont("\n"); - show_regs(regs); + __show_regs(regs); } force_sig_fault(signo, code, (void __user *)addr); From da401e89453266c11277b42e007159a2b6ef82d3 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 11 Jan 2021 20:40:13 +0800 Subject: [PATCH 040/506] riscv: Improve __show_regs Show the function symbols of epc and ra to improve the readability of crash reports, and align the printing formats about the raw epc value. Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/process.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index ced48d9f4254..19f4688f2f36 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -44,7 +44,12 @@ void __show_regs(struct pt_regs *regs) { show_regs_print_info(KERN_DEFAULT); - pr_cont("epc: " REG_FMT " ra : " REG_FMT " sp : " REG_FMT "\n", + if (!user_mode(regs)) { + pr_cont("epc : %pS\n", (void *)regs->epc); + pr_cont(" ra : %pS\n", (void *)regs->ra); + } + + pr_cont("epc : " REG_FMT " ra : " REG_FMT " sp : " REG_FMT "\n", regs->epc, regs->ra, regs->sp); pr_cont(" gp : " REG_FMT " tp : " REG_FMT " t0 : " REG_FMT "\n", regs->gp, regs->tp, regs->t0); From f766f77a74f5784d8d4d3c36b1900731f97d08d0 Mon Sep 17 00:00:00 2001 From: Chen Huang Date: Mon, 11 Jan 2021 20:40:14 +0800 Subject: [PATCH 041/506] riscv/stacktrace: Fix stack output without ra on the stack top When a function doesn't have a callee, then it will not push ra into the stack, such as lkdtm_BUG() function, addi sp,sp,-16 sd s0,8(sp) addi s0,sp,16 ebreak The struct stackframe use {fp,ra} to get information from stack, if walk_stackframe() with pr_regs, we will obtain wrong value and bad stacktrace, [] lkdtm_BUG+0x6/0x8 ---[ end trace 18da3fbdf08e25d5 ]--- Correct the next fp and pc, after that, full stacktrace shown as expects, [] lkdtm_BUG+0x6/0x8 [] lkdtm_do_action+0x14/0x1c [] direct_entry+0xc0/0x10a [] full_proxy_write+0x42/0x6a [] vfs_write+0x7e/0x214 [] ksys_write+0x98/0xc0 [] sys_write+0xe/0x16 [] ret_from_syscall+0x0/0x2 ---[ end trace 61917f3d9a9fadcd ]--- Signed-off-by: Chen Huang Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/stacktrace.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 76dadf6d396b..51ce3620ac1f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -54,9 +54,15 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, /* Unwind stack frame */ frame = (struct stackframe *)fp - 1; sp = fp; - fp = frame->fp; - pc = ftrace_graph_ret_addr(current, NULL, frame->ra, - (unsigned long *)(fp - 8)); + if (regs && (regs->epc == pc) && (frame->fp & 0x7)) { + fp = frame->ra; + pc = regs->ra; + } else { + fp = frame->fp; + pc = ftrace_graph_ret_addr(current, NULL, frame->ra, + (unsigned long *)(fp - 8)); + } + } } From a2bc9b21fd3f89b1f9a5df46427855dcf344e6e7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jan 2021 16:09:22 +0100 Subject: [PATCH 042/506] pwm: Remove ZTE ZX driver The ZTE ZX platform is getting removed, so this driver is no longer needed. Cc: Jun Nie Cc: Shawn Guo Signed-off-by: Arnd Bergmann Signed-off-by: Thierry Reding --- .../devicetree/bindings/pwm/pwm-zx.txt | 22 -- drivers/pwm/Kconfig | 10 - drivers/pwm/Makefile | 1 - drivers/pwm/pwm-zx.c | 278 ------------------ 4 files changed, 311 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pwm/pwm-zx.txt delete mode 100644 drivers/pwm/pwm-zx.c diff --git a/Documentation/devicetree/bindings/pwm/pwm-zx.txt b/Documentation/devicetree/bindings/pwm/pwm-zx.txt deleted file mode 100644 index 3c8fe7aa8269..000000000000 --- a/Documentation/devicetree/bindings/pwm/pwm-zx.txt +++ /dev/null @@ -1,22 +0,0 @@ -ZTE ZX PWM controller - -Required properties: - - compatible: Should be "zte,zx296718-pwm". - - reg: Physical base address and length of the controller's registers. - - clocks : The phandle and specifier referencing the controller's clocks. - - clock-names: "pclk" for PCLK, "wclk" for WCLK to the PWM controller. The - PCLK is for register access, while WCLK is the reference clock for - calculating period and duty cycles. - - #pwm-cells: Should be 3. See pwm.yaml in this directory for a description of - the cells format. - -Example: - - pwm: pwm@1439000 { - compatible = "zte,zx296718-pwm"; - reg = <0x1439000 0x1000>; - clocks = <&lsp1crm LSP1_PWM_PCLK>, - <&lsp1crm LSP1_PWM_WCLK>; - clock-names = "pclk", "wclk"; - #pwm-cells = <3>; - }; diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 0937e1c047ac..9a4f66ae8070 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -611,14 +611,4 @@ config PWM_VT8500 To compile this driver as a module, choose M here: the module will be called pwm-vt8500. -config PWM_ZX - tristate "ZTE ZX PWM support" - depends on ARCH_ZX || COMPILE_TEST - depends on HAS_IOMEM - help - Generic PWM framework driver for ZTE ZX family SoCs. - - To compile this driver as a module, choose M here: the module - will be called pwm-zx. - endif diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index 18b89d7fd092..6374d3b1d6f3 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -57,4 +57,3 @@ obj-$(CONFIG_PWM_TIEHRPWM) += pwm-tiehrpwm.o obj-$(CONFIG_PWM_TWL) += pwm-twl.o obj-$(CONFIG_PWM_TWL_LED) += pwm-twl-led.o obj-$(CONFIG_PWM_VT8500) += pwm-vt8500.o -obj-$(CONFIG_PWM_ZX) += pwm-zx.o diff --git a/drivers/pwm/pwm-zx.c b/drivers/pwm/pwm-zx.c deleted file mode 100644 index 34e91195ce98..000000000000 --- a/drivers/pwm/pwm-zx.c +++ /dev/null @@ -1,278 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2017 Sanechips Technology Co., Ltd. - * Copyright 2017 Linaro Ltd. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define ZX_PWM_MODE 0x0 -#define ZX_PWM_CLKDIV_SHIFT 2 -#define ZX_PWM_CLKDIV_MASK GENMASK(11, 2) -#define ZX_PWM_CLKDIV(x) (((x) << ZX_PWM_CLKDIV_SHIFT) & \ - ZX_PWM_CLKDIV_MASK) -#define ZX_PWM_POLAR BIT(1) -#define ZX_PWM_EN BIT(0) -#define ZX_PWM_PERIOD 0x4 -#define ZX_PWM_DUTY 0x8 - -#define ZX_PWM_CLKDIV_MAX 1023 -#define ZX_PWM_PERIOD_MAX 65535 - -struct zx_pwm_chip { - struct pwm_chip chip; - struct clk *pclk; - struct clk *wclk; - void __iomem *base; -}; - -static inline struct zx_pwm_chip *to_zx_pwm_chip(struct pwm_chip *chip) -{ - return container_of(chip, struct zx_pwm_chip, chip); -} - -static inline u32 zx_pwm_readl(struct zx_pwm_chip *zpc, unsigned int hwpwm, - unsigned int offset) -{ - return readl(zpc->base + (hwpwm + 1) * 0x10 + offset); -} - -static inline void zx_pwm_writel(struct zx_pwm_chip *zpc, unsigned int hwpwm, - unsigned int offset, u32 value) -{ - writel(value, zpc->base + (hwpwm + 1) * 0x10 + offset); -} - -static void zx_pwm_set_mask(struct zx_pwm_chip *zpc, unsigned int hwpwm, - unsigned int offset, u32 mask, u32 value) -{ - u32 data; - - data = zx_pwm_readl(zpc, hwpwm, offset); - data &= ~mask; - data |= value & mask; - zx_pwm_writel(zpc, hwpwm, offset, data); -} - -static void zx_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) -{ - struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip); - unsigned long rate; - unsigned int div; - u32 value; - u64 tmp; - - value = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_MODE); - - if (value & ZX_PWM_POLAR) - state->polarity = PWM_POLARITY_NORMAL; - else - state->polarity = PWM_POLARITY_INVERSED; - - if (value & ZX_PWM_EN) - state->enabled = true; - else - state->enabled = false; - - div = (value & ZX_PWM_CLKDIV_MASK) >> ZX_PWM_CLKDIV_SHIFT; - rate = clk_get_rate(zpc->wclk); - - tmp = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_PERIOD); - tmp *= div * NSEC_PER_SEC; - state->period = DIV_ROUND_CLOSEST_ULL(tmp, rate); - - tmp = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_DUTY); - tmp *= div * NSEC_PER_SEC; - state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, rate); -} - -static int zx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - unsigned int duty_ns, unsigned int period_ns) -{ - struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip); - unsigned int period_cycles, duty_cycles; - unsigned long long c; - unsigned int div = 1; - unsigned long rate; - - /* Find out the best divider */ - rate = clk_get_rate(zpc->wclk); - - while (1) { - c = rate / div; - c = c * period_ns; - do_div(c, NSEC_PER_SEC); - - if (c < ZX_PWM_PERIOD_MAX) - break; - - div++; - - if (div > ZX_PWM_CLKDIV_MAX) - return -ERANGE; - } - - /* Calculate duty cycles */ - period_cycles = c; - c *= duty_ns; - do_div(c, period_ns); - duty_cycles = c; - - /* - * If the PWM is being enabled, we have to temporarily disable it - * before configuring the registers. - */ - if (pwm_is_enabled(pwm)) - zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_EN, 0); - - /* Set up registers */ - zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_CLKDIV_MASK, - ZX_PWM_CLKDIV(div)); - zx_pwm_writel(zpc, pwm->hwpwm, ZX_PWM_PERIOD, period_cycles); - zx_pwm_writel(zpc, pwm->hwpwm, ZX_PWM_DUTY, duty_cycles); - - /* Re-enable the PWM if needed */ - if (pwm_is_enabled(pwm)) - zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, - ZX_PWM_EN, ZX_PWM_EN); - - return 0; -} - -static int zx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - const struct pwm_state *state) -{ - struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip); - struct pwm_state cstate; - int ret; - - pwm_get_state(pwm, &cstate); - - if (state->polarity != cstate.polarity) - zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_POLAR, - (state->polarity == PWM_POLARITY_INVERSED) ? - 0 : ZX_PWM_POLAR); - - if (state->period != cstate.period || - state->duty_cycle != cstate.duty_cycle) { - ret = zx_pwm_config(chip, pwm, state->duty_cycle, - state->period); - if (ret) - return ret; - } - - if (state->enabled != cstate.enabled) { - if (state->enabled) { - ret = clk_prepare_enable(zpc->wclk); - if (ret) - return ret; - - zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, - ZX_PWM_EN, ZX_PWM_EN); - } else { - zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, - ZX_PWM_EN, 0); - clk_disable_unprepare(zpc->wclk); - } - } - - return 0; -} - -static const struct pwm_ops zx_pwm_ops = { - .apply = zx_pwm_apply, - .get_state = zx_pwm_get_state, - .owner = THIS_MODULE, -}; - -static int zx_pwm_probe(struct platform_device *pdev) -{ - struct zx_pwm_chip *zpc; - unsigned int i; - int ret; - - zpc = devm_kzalloc(&pdev->dev, sizeof(*zpc), GFP_KERNEL); - if (!zpc) - return -ENOMEM; - - zpc->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(zpc->base)) - return PTR_ERR(zpc->base); - - zpc->pclk = devm_clk_get(&pdev->dev, "pclk"); - if (IS_ERR(zpc->pclk)) - return PTR_ERR(zpc->pclk); - - zpc->wclk = devm_clk_get(&pdev->dev, "wclk"); - if (IS_ERR(zpc->wclk)) - return PTR_ERR(zpc->wclk); - - ret = clk_prepare_enable(zpc->pclk); - if (ret) - return ret; - - zpc->chip.dev = &pdev->dev; - zpc->chip.ops = &zx_pwm_ops; - zpc->chip.base = -1; - zpc->chip.npwm = 4; - zpc->chip.of_xlate = of_pwm_xlate_with_flags; - zpc->chip.of_pwm_n_cells = 3; - - /* - * PWM devices may be enabled by firmware, and let's disable all of - * them initially to save power. - */ - for (i = 0; i < zpc->chip.npwm; i++) - zx_pwm_set_mask(zpc, i, ZX_PWM_MODE, ZX_PWM_EN, 0); - - ret = pwmchip_add(&zpc->chip); - if (ret < 0) { - dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret); - clk_disable_unprepare(zpc->pclk); - return ret; - } - - platform_set_drvdata(pdev, zpc); - - return 0; -} - -static int zx_pwm_remove(struct platform_device *pdev) -{ - struct zx_pwm_chip *zpc = platform_get_drvdata(pdev); - int ret; - - ret = pwmchip_remove(&zpc->chip); - clk_disable_unprepare(zpc->pclk); - - return ret; -} - -static const struct of_device_id zx_pwm_dt_ids[] = { - { .compatible = "zte,zx296718-pwm", }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(of, zx_pwm_dt_ids); - -static struct platform_driver zx_pwm_driver = { - .driver = { - .name = "zx-pwm", - .of_match_table = zx_pwm_dt_ids, - }, - .probe = zx_pwm_probe, - .remove = zx_pwm_remove, -}; -module_platform_driver(zx_pwm_driver); - -MODULE_ALIAS("platform:zx-pwm"); -MODULE_AUTHOR("Shawn Guo "); -MODULE_DESCRIPTION("ZTE ZX PWM Driver"); -MODULE_LICENSE("GPL v2"); From d9b657a5cdbd960de35dee7e06473caf44a9016f Mon Sep 17 00:00:00 2001 From: Simon South Date: Tue, 19 Jan 2021 11:12:05 -0500 Subject: [PATCH 043/506] pwm: rockchip: Enable APB clock during register access while probing Commit 457f74abbed0 ("pwm: rockchip: Keep enabled PWMs running while probing") modified rockchip_pwm_probe() to access a PWM device's registers directly to check whether or not the device is enabled, but did not also change the function so it first enables the device's APB clock to be certain the device can respond. This risks hanging the kernel on systems with PWM devices that use more than a single clock. Avoid this by enabling the device's APB clock before accessing its registers (and disabling the clock when register access is complete). Fixes: 457f74abbed0 ("pwm: rockchip: Keep enabled PWMs running while probing") Reported-by: Thierry Reding Suggested-by: Trent Piepho Signed-off-by: Simon South Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rockchip.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 389a5e140412..e6929bc73968 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -330,9 +330,9 @@ static int rockchip_pwm_probe(struct platform_device *pdev) return ret; } - ret = clk_prepare(pc->pclk); + ret = clk_prepare_enable(pc->pclk); if (ret) { - dev_err(&pdev->dev, "Can't prepare APB clk: %d\n", ret); + dev_err(&pdev->dev, "Can't prepare enable APB clk: %d\n", ret); goto err_clk; } @@ -362,10 +362,12 @@ static int rockchip_pwm_probe(struct platform_device *pdev) if ((ctrl & enable_conf) != enable_conf) clk_disable(pc->clk); + clk_disable(pc->pclk); + return 0; err_pclk: - clk_unprepare(pc->pclk); + clk_disable_unprepare(pc->pclk); err_clk: clk_disable_unprepare(pc->clk); From d5d8d675865ccddfe4da26c85f22c55cec663bf2 Mon Sep 17 00:00:00 2001 From: Simon South Date: Tue, 19 Jan 2021 11:12:06 -0500 Subject: [PATCH 044/506] pwm: rockchip: rockchip_pwm_probe(): Remove superfluous clk_unprepare() If rockchip_pwm_probe() fails to register a PWM device it calls clk_unprepare() for the device's PWM clock, without having first disabled the clock and before jumping to an error handler that also unprepares it. This is likely to produce warnings from the kernel about the clock being unprepared when it is still enabled, and then being unprepared when it has already been unprepared. Prevent these warnings by removing this unnecessary call to clk_unprepare(). Fixes: 48cf973cae33 ("pwm: rockchip: Avoid glitches on already running PWMs") Signed-off-by: Simon South Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rockchip.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index e6929bc73968..90f969f9f5e2 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -351,7 +351,6 @@ static int rockchip_pwm_probe(struct platform_device *pdev) ret = pwmchip_add(&pc->chip); if (ret < 0) { - clk_unprepare(pc->clk); dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); goto err_pclk; } From c9f809d0db69572f360c3b59d611bb6f06498ac9 Mon Sep 17 00:00:00 2001 From: Simon South Date: Tue, 19 Jan 2021 11:12:07 -0500 Subject: [PATCH 045/506] pwm: rockchip: Replace "bus clk" with "PWM clk" Clarify the Rockchip PWM driver's error messages by referring to the clock that operates a PWM device as the "PWM" clock, matching its name in the device tree, rather than the "bus" clock (which is especially misleading in the case of devices that also use a separate clock for bus access). Suggested-by: Robin Murphy Signed-off-by: Simon South Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rockchip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 90f969f9f5e2..b5bab427b5de 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -307,7 +307,7 @@ static int rockchip_pwm_probe(struct platform_device *pdev) pc->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(pc->clk)) return dev_err_probe(&pdev->dev, PTR_ERR(pc->clk), - "Can't get bus clk\n"); + "Can't get PWM clk\n"); } count = of_count_phandle_with_args(pdev->dev.of_node, @@ -326,7 +326,7 @@ static int rockchip_pwm_probe(struct platform_device *pdev) ret = clk_prepare_enable(pc->clk); if (ret) { - dev_err(&pdev->dev, "Can't prepare enable bus clk: %d\n", ret); + dev_err(&pdev->dev, "Can't prepare enable PWM clk: %d\n", ret); return ret; } From d21ba5d6217bd5a6a696678385906ed1994b380b Mon Sep 17 00:00:00 2001 From: Simon South Date: Tue, 19 Jan 2021 11:12:08 -0500 Subject: [PATCH 046/506] pwm: rockchip: Eliminate potential race condition when probing Commit 48cf973cae33 ("pwm: rockchip: Avoid glitches on already running PWMs") introduced a potential race condition in rockchip_pwm_probe(): A consumer could enable an inactive PWM, or disable a running one, between rockchip_pwm_probe() registering the device via pwmchip_add() and checking whether it is enabled (to determine whether it was started by a bootloader). This could result in a device's PWM clock being either enabled once more than necessary, potentially causing it to continue running when no longer needed, or disabled once more than necessary, producing a warning from the kernel. Eliminate these possibilities by modifying rockchip_pwm_probe() so it checks whether a device is enabled before registering it rather than after. Fixes: 48cf973cae33 ("pwm: rockchip: Avoid glitches on already running PWMs") Reported-by: Trent Piepho Signed-off-by: Simon South Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rockchip.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index b5bab427b5de..228147e9bc6e 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -288,6 +288,7 @@ static int rockchip_pwm_probe(struct platform_device *pdev) const struct of_device_id *id; struct rockchip_pwm_chip *pc; u32 enable_conf, ctrl; + bool enabled; int ret, count; id = of_match_device(rockchip_pwm_dt_ids, &pdev->dev); @@ -349,6 +350,10 @@ static int rockchip_pwm_probe(struct platform_device *pdev) pc->chip.of_pwm_n_cells = 3; } + enable_conf = pc->data->enable_conf; + ctrl = readl_relaxed(pc->base + pc->data->regs.ctrl); + enabled = (ctrl & enable_conf) == enable_conf; + ret = pwmchip_add(&pc->chip); if (ret < 0) { dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); @@ -356,9 +361,7 @@ static int rockchip_pwm_probe(struct platform_device *pdev) } /* Keep the PWM clk enabled if the PWM appears to be up and running. */ - enable_conf = pc->data->enable_conf; - ctrl = readl_relaxed(pc->base + pc->data->regs.ctrl); - if ((ctrl & enable_conf) != enable_conf) + if (!enabled) clk_disable(pc->clk); clk_disable(pc->pclk); From 11be938ae003c1cf157f0e80d5c9ad29a0f4c34c Mon Sep 17 00:00:00 2001 From: Simon South Date: Tue, 19 Jan 2021 11:12:09 -0500 Subject: [PATCH 047/506] pwm: rockchip: Enable clock before calling clk_get_rate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documentation for clk_get_rate() in include/linux/clk.h states the function's result is valid only for a clock source that has been enabled. However, the Rockchip PWM driver uses this function in two places to query the rate of a clock without first ensuring it is enabled. Fix this by modifying rockchip_pwm_get_state() and rockchip_pwm_apply() so they enable a device's PWM clock before querying its rate (in the latter case, the querying is actually done in rockchip_pwm_config()) and disable the clock again before returning. Reported-by: Uwe Kleine-König Signed-off-by: Simon South Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rockchip.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 228147e9bc6e..6ad7d0a50aed 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -72,6 +72,10 @@ static void rockchip_pwm_get_state(struct pwm_chip *chip, if (ret) return; + ret = clk_enable(pc->clk); + if (ret) + return; + clk_rate = clk_get_rate(pc->clk); tmp = readl_relaxed(pc->base + pc->data->regs.period); @@ -90,6 +94,7 @@ static void rockchip_pwm_get_state(struct pwm_chip *chip, else state->polarity = PWM_POLARITY_NORMAL; + clk_disable(pc->clk); clk_disable(pc->pclk); } @@ -189,6 +194,10 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (ret) return ret; + ret = clk_enable(pc->clk); + if (ret) + return ret; + pwm_get_state(pwm, &curstate); enabled = curstate.enabled; @@ -208,6 +217,7 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, } out: + clk_disable(pc->clk); clk_disable(pc->pclk); return ret; From d1eb86e59be09c12447fcb959783cbc70a9bec01 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 29 Jan 2021 14:15:48 +0800 Subject: [PATCH 048/506] ACPI: tables: introduce support for FPDT table ACPI Firmware Performance Data Table (FPDT) provides information about firmware performance during system boot, S3 suspend and S3 resume. Have the kernel parse the FPDT table, and expose the firmware performance data to userspace as sysfs attributes under /sys/firmware/acpi/fpdt/. Tested-by: Todd Brandt Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-firmware-acpi | 43 +++ drivers/acpi/Kconfig | 8 + drivers/acpi/Makefile | 1 + drivers/acpi/acpi_fpdt.c | 264 ++++++++++++++++++ 4 files changed, 316 insertions(+) create mode 100644 drivers/acpi/acpi_fpdt.c diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi index b16d30a71709..819939d858c9 100644 --- a/Documentation/ABI/testing/sysfs-firmware-acpi +++ b/Documentation/ABI/testing/sysfs-firmware-acpi @@ -1,3 +1,46 @@ +What: /sys/firmware/acpi/fpdt/ +Date: Jan 2021 +Contact: Zhang Rui +Description: + ACPI Firmware Performance Data Table (FPDT) provides + information for firmware performance data for system boot, + S3 suspend and S3 resume. This sysfs entry contains the + performance data retrieved from the FPDT. + + boot: + firmware_start_ns: Timer value logged at the beginning + of firmware image execution. In nanoseconds. + bootloader_load_ns: Timer value logged just prior to + loading the OS boot loader into memory. + In nanoseconds. + bootloader_launch_ns: Timer value logged just prior to + launching the currently loaded OS boot loader + image. In nanoseconds. + exitbootservice_start_ns: Timer value logged at the + point when the OS loader calls the + ExitBootServices function for UEFI compatible + firmware. In nanoseconds. + exitbootservice_end_ns: Timer value logged at the point + just prior to the OS loader gaining control + back from the ExitBootServices function for + UEFI compatible firmware. In nanoseconds. + suspend: + suspend_start_ns: Timer value recorded at the previous + OS write to SLP_TYP upon entry to S3. In + nanoseconds. + suspend_end_ns: Timer value recorded at the previous + firmware write to SLP_TYP used to trigger + hardware entry to S3. In nanoseconds. + resume: + resume_count: A count of the number of S3 resume cycles + since the last full boot sequence. + resume_avg_ns: Average timer value of all resume cycles + logged since the last full boot sequence, + including the most recent resume. In nanoseconds. + resume_prev_ns: Timer recorded at the end of the previous + platform runtime firmware S3 resume, just prior to + handoff to the OS waking vector. In nanoseconds. + What: /sys/firmware/acpi/bgrt/ Date: January 2012 Contact: Matthew Garrett diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index ebcf534514be..cdfe5c75aa53 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -87,6 +87,14 @@ config ACPI_SPCR_TABLE This table provides information about the configuration of the earlycon console. +config ACPI_FPDT + bool "ACPI Firmware Performance Data Table (FPDT) support" + depends on X86_64 + help + Enable support for the Firmware Performance Data Table (FPDT). + This table provides information on the timing of the system + boot, S3 suspend and S3 resume firmware code paths. + config ACPI_LPIT bool depends on X86_64 diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 076894a3330f..eb93bb7b6479 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -57,6 +57,7 @@ acpi-$(CONFIG_X86) += x86/utils.o acpi-$(CONFIG_X86) += x86/s2idle.o acpi-$(CONFIG_DEBUG_FS) += debugfs.o acpi-y += acpi_lpat.o +acpi-$(CONFIG_ACPI_FPDT) += acpi_fpdt.o acpi-$(CONFIG_ACPI_LPIT) += acpi_lpit.o acpi-$(CONFIG_ACPI_GENERIC_GSI) += irq.o acpi-$(CONFIG_ACPI_WATCHDOG) += acpi_watchdog.o diff --git a/drivers/acpi/acpi_fpdt.c b/drivers/acpi/acpi_fpdt.c new file mode 100644 index 000000000000..a89a806a7a2a --- /dev/null +++ b/drivers/acpi/acpi_fpdt.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * FPDT support for exporting boot and suspend/resume performance data + * + * Copyright (C) 2021 Intel Corporation. All rights reserved. + */ + +#define pr_fmt(fmt) "ACPI FPDT: " fmt + +#include + +/* + * FPDT contains ACPI table header and a number of fpdt_subtable_entries. + * Each fpdt_subtable_entry points to a subtable: FBPT or S3PT. + * Each FPDT subtable (FBPT/S3PT) is composed of a fpdt_subtable_header + * and a number of fpdt performance records. + * Each FPDT performance record is composed of a fpdt_record_header and + * performance data fields, for boot or suspend or resume phase. + */ +enum fpdt_subtable_type { + SUBTABLE_FBPT, + SUBTABLE_S3PT, +}; + +struct fpdt_subtable_entry { + u16 type; /* refer to enum fpdt_subtable_type */ + u8 length; + u8 revision; + u32 reserved; + u64 address; /* physical address of the S3PT/FBPT table */ +}; + +struct fpdt_subtable_header { + u32 signature; + u32 length; +}; + +enum fpdt_record_type { + RECORD_S3_RESUME, + RECORD_S3_SUSPEND, + RECORD_BOOT, +}; + +struct fpdt_record_header { + u16 type; /* refer to enum fpdt_record_type */ + u8 length; + u8 revision; +}; + +struct resume_performance_record { + struct fpdt_record_header header; + u32 resume_count; + u64 resume_prev; + u64 resume_avg; +} __attribute__((packed)); + +struct boot_performance_record { + struct fpdt_record_header header; + u32 reserved; + u64 firmware_start; + u64 bootloader_load; + u64 bootloader_launch; + u64 exitbootservice_start; + u64 exitbootservice_end; +} __attribute__((packed)); + +struct suspend_performance_record { + struct fpdt_record_header header; + u64 suspend_start; + u64 suspend_end; +} __attribute__((packed)); + + +static struct resume_performance_record *record_resume; +static struct suspend_performance_record *record_suspend; +static struct boot_performance_record *record_boot; + +#define FPDT_ATTR(phase, name) \ +static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%llu\n", record_##phase->name); \ +} \ +static struct kobj_attribute name##_attr = \ +__ATTR(name##_ns, 0444, name##_show, NULL) + +FPDT_ATTR(resume, resume_prev); +FPDT_ATTR(resume, resume_avg); +FPDT_ATTR(suspend, suspend_start); +FPDT_ATTR(suspend, suspend_end); +FPDT_ATTR(boot, firmware_start); +FPDT_ATTR(boot, bootloader_load); +FPDT_ATTR(boot, bootloader_launch); +FPDT_ATTR(boot, exitbootservice_start); +FPDT_ATTR(boot, exitbootservice_end); + +static ssize_t resume_count_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", record_resume->resume_count); +} + +static struct kobj_attribute resume_count_attr = +__ATTR_RO(resume_count); + +static struct attribute *resume_attrs[] = { + &resume_count_attr.attr, + &resume_prev_attr.attr, + &resume_avg_attr.attr, + NULL +}; + +static const struct attribute_group resume_attr_group = { + .attrs = resume_attrs, + .name = "resume", +}; + +static struct attribute *suspend_attrs[] = { + &suspend_start_attr.attr, + &suspend_end_attr.attr, + NULL +}; + +static const struct attribute_group suspend_attr_group = { + .attrs = suspend_attrs, + .name = "suspend", +}; + +static struct attribute *boot_attrs[] = { + &firmware_start_attr.attr, + &bootloader_load_attr.attr, + &bootloader_launch_attr.attr, + &exitbootservice_start_attr.attr, + &exitbootservice_end_attr.attr, + NULL +}; + +static const struct attribute_group boot_attr_group = { + .attrs = boot_attrs, + .name = "boot", +}; + +static struct kobject *fpdt_kobj; + +static int fpdt_process_subtable(u64 address, u32 subtable_type) +{ + struct fpdt_subtable_header *subtable_header; + struct fpdt_record_header *record_header; + char *signature = (subtable_type == SUBTABLE_FBPT ? "FBPT" : "S3PT"); + u32 length, offset; + int result; + + subtable_header = acpi_os_map_memory(address, sizeof(*subtable_header)); + if (!subtable_header) + return -ENOMEM; + + if (strncmp((char *)&subtable_header->signature, signature, 4)) { + pr_info(FW_BUG "subtable signature and type mismatch!\n"); + return -EINVAL; + } + + length = subtable_header->length; + acpi_os_unmap_memory(subtable_header, sizeof(*subtable_header)); + + subtable_header = acpi_os_map_memory(address, length); + if (!subtable_header) + return -ENOMEM; + + offset = sizeof(*subtable_header); + while (offset < length) { + record_header = (void *)subtable_header + offset; + offset += record_header->length; + + switch (record_header->type) { + case RECORD_S3_RESUME: + if (subtable_type != SUBTABLE_S3PT) { + pr_err(FW_BUG "Invalid record %d for subtable %s\n", + record_header->type, signature); + return -EINVAL; + } + if (record_resume) { + pr_err("Duplicate resume performance record found.\n"); + continue; + } + record_resume = (struct resume_performance_record *)record_header; + result = sysfs_create_group(fpdt_kobj, &resume_attr_group); + if (result) + return result; + break; + case RECORD_S3_SUSPEND: + if (subtable_type != SUBTABLE_S3PT) { + pr_err(FW_BUG "Invalid %d for subtable %s\n", + record_header->type, signature); + continue; + } + if (record_suspend) { + pr_err("Duplicate suspend performance record found.\n"); + continue; + } + record_suspend = (struct suspend_performance_record *)record_header; + result = sysfs_create_group(fpdt_kobj, &suspend_attr_group); + if (result) + return result; + break; + case RECORD_BOOT: + if (subtable_type != SUBTABLE_FBPT) { + pr_err(FW_BUG "Invalid %d for subtable %s\n", + record_header->type, signature); + return -EINVAL; + } + if (record_boot) { + pr_err("Duplicate boot performance record found.\n"); + continue; + } + record_boot = (struct boot_performance_record *)record_header; + result = sysfs_create_group(fpdt_kobj, &boot_attr_group); + if (result) + return result; + break; + + default: + pr_err(FW_BUG "Invalid record %d found.\n", record_header->type); + return -EINVAL; + } + } + return 0; +} + +static int __init acpi_init_fpdt(void) +{ + acpi_status status; + struct acpi_table_header *header; + struct fpdt_subtable_entry *subtable; + u32 offset = sizeof(*header); + + status = acpi_get_table(ACPI_SIG_FPDT, 0, &header); + + if (ACPI_FAILURE(status)) + return 0; + + fpdt_kobj = kobject_create_and_add("fpdt", acpi_kobj); + if (!fpdt_kobj) + return -ENOMEM; + + while (offset < header->length) { + subtable = (void *)header + offset; + switch (subtable->type) { + case SUBTABLE_FBPT: + case SUBTABLE_S3PT: + fpdt_process_subtable(subtable->address, + subtable->type); + break; + default: + pr_info(FW_BUG "Invalid subtable type %d found.\n", + subtable->type); + break; + } + offset += sizeof(*subtable); + } + return 0; +} + +fs_initcall(acpi_init_fpdt); From fe7952c629daec6fb71f55376cb2ed543fde3186 Mon Sep 17 00:00:00 2001 From: Akhil P Oommen Date: Fri, 8 Jan 2021 23:45:30 +0530 Subject: [PATCH 049/506] drm/msm: Add speed-bin support to a618 gpu Some GPUs support different max frequencies depending on the platform. To identify the correct variant, we should check the gpu speedbin fuse value. Add support for this speedbin detection to a6xx family along with the required fuse details for a618 gpu. Signed-off-by: Akhil P Oommen Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 +++++++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 2 + 2 files changed, 85 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 130661898546..499d134f2ef3 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -10,6 +10,7 @@ #include #include +#include #include #define GPU_PAS_ID 13 @@ -1208,6 +1209,10 @@ static void a6xx_destroy(struct msm_gpu *gpu) a6xx_gmu_remove(a6xx_gpu); adreno_gpu_cleanup(adreno_gpu); + + if (a6xx_gpu->opp_table) + dev_pm_opp_put_supported_hw(a6xx_gpu->opp_table); + kfree(a6xx_gpu); } @@ -1264,6 +1269,78 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring) return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR); } +static u32 a618_get_speed_bin(u32 fuse) +{ + if (fuse == 0) + return 0; + else if (fuse == 169) + return 1; + else if (fuse == 174) + return 2; + + return UINT_MAX; +} + +static u32 fuse_to_supp_hw(struct device *dev, u32 revn, u32 fuse) +{ + u32 val = UINT_MAX; + + if (revn == 618) + val = a618_get_speed_bin(fuse); + + if (val == UINT_MAX) { + DRM_DEV_ERROR(dev, + "missing support for speed-bin: %u. Some OPPs may not be supported by hardware", + fuse); + return UINT_MAX; + } + + return (1 << val); +} + +static int a6xx_set_supported_hw(struct device *dev, struct a6xx_gpu *a6xx_gpu, + u32 revn) +{ + struct opp_table *opp_table; + struct nvmem_cell *cell; + u32 supp_hw = UINT_MAX; + void *buf; + + cell = nvmem_cell_get(dev, "speed_bin"); + /* + * -ENOENT means that the platform doesn't support speedbin which is + * fine + */ + if (PTR_ERR(cell) == -ENOENT) + return 0; + else if (IS_ERR(cell)) { + DRM_DEV_ERROR(dev, + "failed to read speed-bin. Some OPPs may not be supported by hardware"); + goto done; + } + + buf = nvmem_cell_read(cell, NULL); + if (IS_ERR(buf)) { + nvmem_cell_put(cell); + DRM_DEV_ERROR(dev, + "failed to read speed-bin. Some OPPs may not be supported by hardware"); + goto done; + } + + supp_hw = fuse_to_supp_hw(dev, revn, *((u32 *) buf)); + + kfree(buf); + nvmem_cell_put(cell); + +done: + opp_table = dev_pm_opp_set_supported_hw(dev, &supp_hw, 1); + if (IS_ERR(opp_table)) + return PTR_ERR(opp_table); + + a6xx_gpu->opp_table = opp_table; + return 0; +} + static const struct adreno_gpu_funcs funcs = { .base = { .get_param = adreno_get_param, @@ -1325,6 +1402,12 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) a6xx_llc_slices_init(pdev, a6xx_gpu); + ret = a6xx_set_supported_hw(&pdev->dev, a6xx_gpu, info->revn); + if (ret) { + a6xx_destroy(&(a6xx_gpu->base.base)); + return ERR_PTR(ret); + } + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) { a6xx_destroy(&(a6xx_gpu->base.base)); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index e793d329e77b..ce0610c5256f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -33,6 +33,8 @@ struct a6xx_gpu { void *llc_slice; void *htw_llc_slice; bool have_mmu500; + + struct opp_table *opp_table; }; #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) From 6cefa31e810404dafdfcdb94874146cea11626c2 Mon Sep 17 00:00:00 2001 From: Iskren Chernev Date: Sat, 2 Jan 2021 22:24:37 +0200 Subject: [PATCH 050/506] drm/msm: Fix MSM_INFO_GET_IOVA with carveout The msm_gem_get_iova should be guarded with gpu != NULL and not aspace != NULL, because aspace is NULL when using vram carveout. Fixes: 933415e24bd0d ("drm/msm: Add support for private address space instances") Signed-off-by: Iskren Chernev Tested-by: Alexey Minnekhanov Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 108c405e03dd..94525ac76d4e 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -788,9 +788,10 @@ static int msm_ioctl_gem_info_iova(struct drm_device *dev, struct drm_file *file, struct drm_gem_object *obj, uint64_t *iova) { + struct msm_drm_private *priv = dev->dev_private; struct msm_file_private *ctx = file->driver_priv; - if (!ctx->aspace) + if (!priv->gpu) return -EINVAL; /* From 4f2cf99d542c4070d5def1db20ce6436e5f92e53 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Sat, 9 Jan 2021 14:40:44 +0100 Subject: [PATCH 051/506] drm/msm/a5xx: Allow all patchid for A540 chip On at least MSM8998 it's possible to find Adreno 540.0 and 540.1 but I have never found any 540.2. In any case, the patchids 0-1 for A540 are completely supported by this driver and there is no reason to disallow probing them (as they also share the same firmware names). Besides that, the patchid number is also used in the a5xx_power.c function a540_lm_setup to disable the battery current limiter, which makes faking the Adreno patchid to .2 (which would anyway be sad) useless and even producing breakages. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 12e75ba360f9..ce9345cc29a4 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -168,7 +168,7 @@ static const struct adreno_info gpulist[] = { .init = a5xx_gpu_init, .zapfw = "a530_zap.mdt", }, { - .rev = ADRENO_REV(5, 4, 0, 2), + .rev = ADRENO_REV(5, 4, 0, ANY_ID), .revn = 540, .name = "A540", .fw = { From 276619c0923f8fa6a82e60edb88a82468645362d Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Mon, 11 Jan 2021 17:34:08 +0530 Subject: [PATCH 052/506] drm/msm: Add proper checks for GPU LLCC support Domain attribute setting for LLCC is guarded by !IS_ERR check which works fine only when CONFIG_QCOM_LLCC=y but when it is disabled, the LLCC apis return NULL and that is not handled by IS_ERR check. Due to this, domain attribute for LLCC will be set even on GPUs which do not support it and cause issues, so correct this by using IS_ERR_OR_NULL checks appropriately. Meanwhile also cleanup comment block and remove unwanted blank line. Fixes: 00fd44a1a470 ("drm/msm: Only enable A6xx LLCC code on A6xx") Fixes: 474dadb8b0d5 ("drm/msm/a6xx: Add support for using system cache(LLC)") Signed-off-by: Sai Prakash Ranjan Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 499d134f2ef3..3a64b4557478 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1118,7 +1118,7 @@ static void a6xx_llc_slices_init(struct platform_device *pdev, a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); - if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice)) + if (IS_ERR_OR_NULL(a6xx_gpu->llc_slice) && IS_ERR_OR_NULL(a6xx_gpu->htw_llc_slice)) a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL); } diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index f09175698827..b35914de1b27 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -200,15 +200,15 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, if (!iommu) return NULL; - if (adreno_is_a6xx(adreno_gpu)) { struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); struct io_pgtable_domain_attr pgtbl_cfg; + /* - * This allows GPU to set the bus attributes required to use system - * cache on behalf of the iommu page table walker. - */ - if (!IS_ERR(a6xx_gpu->htw_llc_slice)) { + * This allows GPU to set the bus attributes required to use system + * cache on behalf of the iommu page table walker. + */ + if (!IS_ERR_OR_NULL(a6xx_gpu->htw_llc_slice)) { pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA; iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg); } From 45596f254061dae8ee04021c7146b362501a02ee Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Mon, 11 Jan 2021 17:34:09 +0530 Subject: [PATCH 053/506] drm/msm/a6xx: Create an A6XX GPU specific address space A6XX GPUs have support for last level cache(LLC) also known as system cache and need to set the bus attributes to use it. Currently we use a generic adreno iommu address space implementation which are also used by older GPU generations which do not have LLC and might introduce issues accidentally and is not clean in a way that anymore additions of GPUs supporting LLC would have to be guarded under ifdefs. So keep the generic code separate and make the address space creation A6XX specific. We also have a helper to set the llc attributes so that if the newer GPU generations do support them, we can use it instead of open coding domain attribute setting for each GPU. Signed-off-by: Sai Prakash Ranjan Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 ++++++++++++++++++++++++- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 23 +++++-------- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 7 ++-- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 3a64b4557478..0a51cad42642 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1244,6 +1244,50 @@ static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu) return (unsigned long)busy_time; } +static struct msm_gem_address_space * +a6xx_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + struct iommu_domain *iommu; + struct msm_mmu *mmu; + struct msm_gem_address_space *aspace; + u64 start, size; + + iommu = iommu_domain_alloc(&platform_bus_type); + if (!iommu) + return NULL; + + /* + * This allows GPU to set the bus attributes required to use system + * cache on behalf of the iommu page table walker. + */ + if (!IS_ERR_OR_NULL(a6xx_gpu->htw_llc_slice)) + adreno_set_llc_attributes(iommu); + + mmu = msm_iommu_new(&pdev->dev, iommu); + if (IS_ERR(mmu)) { + iommu_domain_free(iommu); + return ERR_CAST(mmu); + } + + /* + * Use the aperture start or SZ_16M, whichever is greater. This will + * ensure that we align with the allocated pagetable range while still + * allowing room in the lower 32 bits for GMEM and whatnot + */ + start = max_t(u64, SZ_16M, iommu->geometry.aperture_start); + size = iommu->geometry.aperture_end - start + 1; + + aspace = msm_gem_address_space_create(mmu, "gpu", + start & GENMASK_ULL(48, 0), size); + + if (IS_ERR(aspace) && !IS_ERR(mmu)) + mmu->funcs->destroy(mmu); + + return aspace; +} + static struct msm_gem_address_space * a6xx_create_private_address_space(struct msm_gpu *gpu) { @@ -1362,7 +1406,7 @@ static const struct adreno_gpu_funcs funcs = { .gpu_state_get = a6xx_gpu_state_get, .gpu_state_put = a6xx_gpu_state_put, #endif - .create_address_space = adreno_iommu_create_address_space, + .create_address_space = a6xx_create_address_space, .create_private_address_space = a6xx_create_private_address_space, .get_rptr = a6xx_get_rptr, }, diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index b35914de1b27..0f184c3dd9d9 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -186,11 +186,18 @@ int adreno_zap_shader_load(struct msm_gpu *gpu, u32 pasid) return zap_shader_load_mdt(gpu, adreno_gpu->info->zapfw, pasid); } +void adreno_set_llc_attributes(struct iommu_domain *iommu) +{ + struct io_pgtable_domain_attr pgtbl_cfg; + + pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA; + iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg); +} + struct msm_gem_address_space * adreno_iommu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) { - struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct iommu_domain *iommu; struct msm_mmu *mmu; struct msm_gem_address_space *aspace; @@ -200,20 +207,6 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, if (!iommu) return NULL; - if (adreno_is_a6xx(adreno_gpu)) { - struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); - struct io_pgtable_domain_attr pgtbl_cfg; - - /* - * This allows GPU to set the bus attributes required to use system - * cache on behalf of the iommu page table walker. - */ - if (!IS_ERR_OR_NULL(a6xx_gpu->htw_llc_slice)) { - pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA; - iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg); - } - } - mmu = msm_iommu_new(&pdev->dev, iommu); if (IS_ERR(mmu)) { iommu_domain_free(iommu); diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index b3d9a333591b..2a3d049b46b5 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -212,11 +212,6 @@ static inline int adreno_is_a540(struct adreno_gpu *gpu) return gpu->revn == 540; } -static inline bool adreno_is_a6xx(struct adreno_gpu *gpu) -{ - return ((gpu->revn < 700 && gpu->revn > 599)); -} - static inline int adreno_is_a618(struct adreno_gpu *gpu) { return gpu->revn == 618; @@ -278,6 +273,8 @@ struct msm_gem_address_space * adreno_iommu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev); +void adreno_set_llc_attributes(struct iommu_domain *iommu); + /* * For a5xx and a6xx targets load the zap shader that is used to pull the GPU * out of secure mode From 8f03c30cb814213e36032084a01f49a9e604a3e3 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 13 Jan 2021 19:33:33 +0100 Subject: [PATCH 054/506] drm/msm/a5xx: Remove overwriting A5XX_PC_DBG_ECO_CNTL register The PC_DBG_ECO_CNTL register on the Adreno A5xx family gets programmed to some different values on a per-model basis. At least, this is what we intend to do here; Unfortunately, though, this register is being overwritten with a static magic number, right after applying the GPU-specific configuration (including the GPU-specific quirks) and that is effectively nullifying the efforts. Let's remove the redundant and wrong write to the PC_DBG_ECO_CNTL register in order to retain the wanted configuration for the target GPU. Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index a5af223eaf50..81506d2539b0 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -626,8 +626,6 @@ static int a5xx_hw_init(struct msm_gpu *gpu) if (adreno_gpu->info->quirks & ADRENO_QUIRK_TWO_PASS_USE_WFI) gpu_rmw(gpu, REG_A5XX_PC_DBG_ECO_CNTL, 0, (1 << 8)); - gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, 0xc0200100); - /* Enable USE_RETENTION_FLOPS */ gpu_write(gpu, REG_A5XX_CP_CHICKEN_DBG, 0x02000000); From 4340b46ad1619c9607931a59d98d4d5093415475 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 13 Jan 2021 19:33:34 +0100 Subject: [PATCH 055/506] drm/msm/a5xx: Separate A5XX_PC_DBG_ECO_CNTL write from main branch The "main" if branch where we program the other registers for the Adreno 5xx family of GPUs should not contain the PC_DBG_ECO_CNTL register programming because this has logical similarity differences from all the others. A later commit will show the entire sense of this. Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 81506d2539b0..8c96fc0fc1b7 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -609,8 +609,6 @@ static int a5xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x20); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_2, 0x40000030); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_1, 0x20100D0A); - gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, - (0x200 << 11 | 0x200 << 22)); } else { gpu_write(gpu, REG_A5XX_CP_MEQ_THRESHOLDS, 0x40); if (adreno_is_a530(adreno_gpu)) @@ -619,9 +617,14 @@ static int a5xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x400); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_2, 0x80000060); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_1, 0x40201B16); + } + + if (adreno_is_a510(adreno_gpu)) + gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, + (0x200 << 11 | 0x200 << 22)); + else gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, (0x400 << 11 | 0x300 << 22)); - } if (adreno_gpu->info->quirks & ADRENO_QUIRK_TWO_PASS_USE_WFI) gpu_rmw(gpu, REG_A5XX_PC_DBG_ECO_CNTL, 0, (1 << 8)); From 1d832ab30ce64abe30571bc12931a296a8a27c4d Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 13 Jan 2021 19:33:35 +0100 Subject: [PATCH 056/506] drm/msm/a5xx: Add support for Adreno 508, 509, 512 GPUs The Adreno 508/509/512 GPUs are stripped versions of the Adreno 5xx found in the mid-end SoCs such as SDM630, SDM636, SDM660 and SDA variants; these SoCs are usually provided with ZAP firmwares, but they have no available GPMU. Signed-off-by: AngeloGioacchino Del Regno Tested-by: Martin Botka Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 172 ++++++++++++++++++--- drivers/gpu/drm/msm/adreno/a5xx_power.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_device.c | 52 +++++++ drivers/gpu/drm/msm/adreno/adreno_gpu.h | 15 ++ 4 files changed, 223 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 8c96fc0fc1b7..04ffd84c1190 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -222,7 +222,7 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) a5xx_preempt_trigger(gpu); } -static const struct { +static const struct adreno_five_hwcg_regs { u32 offset; u32 value; } a5xx_hwcg[] = { @@ -318,16 +318,124 @@ static const struct { {REG_A5XX_RBBM_CLOCK_DELAY_TSE_RAS_RBBM, 0x00004000}, {REG_A5XX_RBBM_CLOCK_DELAY_GPC, 0x00000200}, {REG_A5XX_RBBM_CLOCK_DELAY_VFD, 0x00002222} +}, a50x_hwcg[] = { + {REG_A5XX_RBBM_CLOCK_CNTL_SP0, 0x02222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_SP0, 0x02222220}, + {REG_A5XX_RBBM_CLOCK_HYST_SP0, 0x0000F3CF}, + {REG_A5XX_RBBM_CLOCK_DELAY_SP0, 0x00000080}, + {REG_A5XX_RBBM_CLOCK_CNTL_TP0, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_TP0, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL3_TP0, 0x00002222}, + {REG_A5XX_RBBM_CLOCK_HYST_TP0, 0x77777777}, + {REG_A5XX_RBBM_CLOCK_HYST2_TP0, 0x77777777}, + {REG_A5XX_RBBM_CLOCK_HYST3_TP0, 0x00007777}, + {REG_A5XX_RBBM_CLOCK_DELAY_TP0, 0x11111111}, + {REG_A5XX_RBBM_CLOCK_DELAY2_TP0, 0x11111111}, + {REG_A5XX_RBBM_CLOCK_DELAY3_TP0, 0x00001111}, + {REG_A5XX_RBBM_CLOCK_CNTL2_UCHE, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL3_UCHE, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL4_UCHE, 0x00222222}, + {REG_A5XX_RBBM_CLOCK_CNTL_UCHE, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_HYST_UCHE, 0x00FFFFF4}, + {REG_A5XX_RBBM_CLOCK_DELAY_UCHE, 0x00000002}, + {REG_A5XX_RBBM_CLOCK_CNTL_RB0, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_RB0, 0x00222222}, + {REG_A5XX_RBBM_CLOCK_CNTL_CCU0, 0x00022220}, + {REG_A5XX_RBBM_CLOCK_CNTL_RAC, 0x05522222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_RAC, 0x00505555}, + {REG_A5XX_RBBM_CLOCK_HYST_RB_CCU0, 0x04040404}, + {REG_A5XX_RBBM_CLOCK_HYST_RAC, 0x07444044}, + {REG_A5XX_RBBM_CLOCK_DELAY_RB_CCU_L1_0, 0x00000002}, + {REG_A5XX_RBBM_CLOCK_DELAY_RAC, 0x00010011}, + {REG_A5XX_RBBM_CLOCK_CNTL_TSE_RAS_RBBM, 0x04222222}, + {REG_A5XX_RBBM_CLOCK_MODE_GPC, 0x02222222}, + {REG_A5XX_RBBM_CLOCK_MODE_VFD, 0x00002222}, + {REG_A5XX_RBBM_CLOCK_HYST_TSE_RAS_RBBM, 0x00000000}, + {REG_A5XX_RBBM_CLOCK_HYST_GPC, 0x04104004}, + {REG_A5XX_RBBM_CLOCK_HYST_VFD, 0x00000000}, + {REG_A5XX_RBBM_CLOCK_DELAY_HLSQ, 0x00000000}, + {REG_A5XX_RBBM_CLOCK_DELAY_TSE_RAS_RBBM, 0x00004000}, + {REG_A5XX_RBBM_CLOCK_DELAY_GPC, 0x00000200}, + {REG_A5XX_RBBM_CLOCK_DELAY_VFD, 0x00002222}, +}, a512_hwcg[] = { + {REG_A5XX_RBBM_CLOCK_CNTL_SP0, 0x02222222}, + {REG_A5XX_RBBM_CLOCK_CNTL_SP1, 0x02222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_SP0, 0x02222220}, + {REG_A5XX_RBBM_CLOCK_CNTL2_SP1, 0x02222220}, + {REG_A5XX_RBBM_CLOCK_HYST_SP0, 0x0000F3CF}, + {REG_A5XX_RBBM_CLOCK_HYST_SP1, 0x0000F3CF}, + {REG_A5XX_RBBM_CLOCK_DELAY_SP0, 0x00000080}, + {REG_A5XX_RBBM_CLOCK_DELAY_SP1, 0x00000080}, + {REG_A5XX_RBBM_CLOCK_CNTL_TP0, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL_TP1, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_TP0, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_TP1, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL3_TP0, 0x00002222}, + {REG_A5XX_RBBM_CLOCK_CNTL3_TP1, 0x00002222}, + {REG_A5XX_RBBM_CLOCK_HYST_TP0, 0x77777777}, + {REG_A5XX_RBBM_CLOCK_HYST_TP1, 0x77777777}, + {REG_A5XX_RBBM_CLOCK_HYST2_TP0, 0x77777777}, + {REG_A5XX_RBBM_CLOCK_HYST2_TP1, 0x77777777}, + {REG_A5XX_RBBM_CLOCK_HYST3_TP0, 0x00007777}, + {REG_A5XX_RBBM_CLOCK_HYST3_TP1, 0x00007777}, + {REG_A5XX_RBBM_CLOCK_DELAY_TP0, 0x11111111}, + {REG_A5XX_RBBM_CLOCK_DELAY_TP1, 0x11111111}, + {REG_A5XX_RBBM_CLOCK_DELAY2_TP0, 0x11111111}, + {REG_A5XX_RBBM_CLOCK_DELAY2_TP1, 0x11111111}, + {REG_A5XX_RBBM_CLOCK_DELAY3_TP0, 0x00001111}, + {REG_A5XX_RBBM_CLOCK_DELAY3_TP1, 0x00001111}, + {REG_A5XX_RBBM_CLOCK_CNTL_UCHE, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_UCHE, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL3_UCHE, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL4_UCHE, 0x00222222}, + {REG_A5XX_RBBM_CLOCK_HYST_UCHE, 0x00444444}, + {REG_A5XX_RBBM_CLOCK_DELAY_UCHE, 0x00000002}, + {REG_A5XX_RBBM_CLOCK_CNTL_RB0, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL_RB1, 0x22222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_RB0, 0x00222222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_RB1, 0x00222222}, + {REG_A5XX_RBBM_CLOCK_CNTL_CCU0, 0x00022220}, + {REG_A5XX_RBBM_CLOCK_CNTL_CCU1, 0x00022220}, + {REG_A5XX_RBBM_CLOCK_CNTL_RAC, 0x05522222}, + {REG_A5XX_RBBM_CLOCK_CNTL2_RAC, 0x00505555}, + {REG_A5XX_RBBM_CLOCK_HYST_RB_CCU0, 0x04040404}, + {REG_A5XX_RBBM_CLOCK_HYST_RB_CCU1, 0x04040404}, + {REG_A5XX_RBBM_CLOCK_HYST_RAC, 0x07444044}, + {REG_A5XX_RBBM_CLOCK_DELAY_RB_CCU_L1_0, 0x00000002}, + {REG_A5XX_RBBM_CLOCK_DELAY_RB_CCU_L1_1, 0x00000002}, + {REG_A5XX_RBBM_CLOCK_DELAY_RAC, 0x00010011}, + {REG_A5XX_RBBM_CLOCK_CNTL_TSE_RAS_RBBM, 0x04222222}, + {REG_A5XX_RBBM_CLOCK_MODE_GPC, 0x02222222}, + {REG_A5XX_RBBM_CLOCK_MODE_VFD, 0x00002222}, + {REG_A5XX_RBBM_CLOCK_HYST_TSE_RAS_RBBM, 0x00000000}, + {REG_A5XX_RBBM_CLOCK_HYST_GPC, 0x04104004}, + {REG_A5XX_RBBM_CLOCK_HYST_VFD, 0x00000000}, + {REG_A5XX_RBBM_CLOCK_DELAY_HLSQ, 0x00000000}, + {REG_A5XX_RBBM_CLOCK_DELAY_TSE_RAS_RBBM, 0x00004000}, + {REG_A5XX_RBBM_CLOCK_DELAY_GPC, 0x00000200}, + {REG_A5XX_RBBM_CLOCK_DELAY_VFD, 0x00002222}, }; void a5xx_set_hwcg(struct msm_gpu *gpu, bool state) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - unsigned int i; + const struct adreno_five_hwcg_regs *regs; + unsigned int i, sz; - for (i = 0; i < ARRAY_SIZE(a5xx_hwcg); i++) - gpu_write(gpu, a5xx_hwcg[i].offset, - state ? a5xx_hwcg[i].value : 0); + if (adreno_is_a508(adreno_gpu)) { + regs = a50x_hwcg; + sz = ARRAY_SIZE(a50x_hwcg); + } else if (adreno_is_a509(adreno_gpu) || adreno_is_a512(adreno_gpu)) { + regs = a512_hwcg; + sz = ARRAY_SIZE(a512_hwcg); + } else { + regs = a5xx_hwcg; + sz = ARRAY_SIZE(a5xx_hwcg); + } + + for (i = 0; i < sz; i++) + gpu_write(gpu, regs[i].offset, + state ? regs[i].value : 0); if (adreno_is_a540(adreno_gpu)) { gpu_write(gpu, REG_A5XX_RBBM_CLOCK_DELAY_GPMU, state ? 0x00000770 : 0); @@ -538,11 +646,13 @@ static int a5xx_hw_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + u32 regbit; int ret; gpu_write(gpu, REG_A5XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x00000003); - if (adreno_is_a540(adreno_gpu)) + if (adreno_is_a509(adreno_gpu) || adreno_is_a512(adreno_gpu) || + adreno_is_a540(adreno_gpu)) gpu_write(gpu, REG_A5XX_VBIF_GATE_OFF_WRREQ_EN, 0x00000009); /* Make all blocks contribute to the GPU BUSY perf counter */ @@ -604,22 +714,29 @@ static int a5xx_hw_init(struct msm_gpu *gpu) 0x00100000 + adreno_gpu->gmem - 1); gpu_write(gpu, REG_A5XX_UCHE_GMEM_RANGE_MAX_HI, 0x00000000); - if (adreno_is_a510(adreno_gpu)) { + if (adreno_is_a508(adreno_gpu) || adreno_is_a510(adreno_gpu)) { gpu_write(gpu, REG_A5XX_CP_MEQ_THRESHOLDS, 0x20); - gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x20); + if (adreno_is_a508(adreno_gpu)) + gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x400); + else + gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x20); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_2, 0x40000030); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_1, 0x20100D0A); } else { gpu_write(gpu, REG_A5XX_CP_MEQ_THRESHOLDS, 0x40); if (adreno_is_a530(adreno_gpu)) gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x40); - if (adreno_is_a540(adreno_gpu)) + else gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x400); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_2, 0x80000060); gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_1, 0x40201B16); } - if (adreno_is_a510(adreno_gpu)) + if (adreno_is_a508(adreno_gpu)) + gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, + (0x100 << 11 | 0x100 << 22)); + else if (adreno_is_a509(adreno_gpu) || adreno_is_a510(adreno_gpu) || + adreno_is_a512(adreno_gpu)) gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, (0x200 << 11 | 0x200 << 22)); else @@ -629,6 +746,14 @@ static int a5xx_hw_init(struct msm_gpu *gpu) if (adreno_gpu->info->quirks & ADRENO_QUIRK_TWO_PASS_USE_WFI) gpu_rmw(gpu, REG_A5XX_PC_DBG_ECO_CNTL, 0, (1 << 8)); + /* + * Disable the RB sampler datapath DP2 clock gating optimization + * for 1-SP GPUs, as it is enabled by default. + */ + if (adreno_is_a508(adreno_gpu) || adreno_is_a509(adreno_gpu) || + adreno_is_a512(adreno_gpu)) + gpu_rmw(gpu, REG_A5XX_RB_DBG_ECO_CNTL, 0, (1 << 9)); + /* Enable USE_RETENTION_FLOPS */ gpu_write(gpu, REG_A5XX_CP_CHICKEN_DBG, 0x02000000); @@ -654,10 +779,17 @@ static int a5xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL2, 0x0000003F); /* Set the highest bank bit */ - gpu_write(gpu, REG_A5XX_TPL1_MODE_CNTL, 2 << 7); - gpu_write(gpu, REG_A5XX_RB_MODE_CNTL, 2 << 1); if (adreno_is_a540(adreno_gpu)) - gpu_write(gpu, REG_A5XX_UCHE_DBG_ECO_CNTL_2, 2); + regbit = 2; + else + regbit = 1; + + gpu_write(gpu, REG_A5XX_TPL1_MODE_CNTL, regbit << 7); + gpu_write(gpu, REG_A5XX_RB_MODE_CNTL, regbit << 1); + + if (adreno_is_a509(adreno_gpu) || adreno_is_a512(adreno_gpu) || + adreno_is_a540(adreno_gpu)) + gpu_write(gpu, REG_A5XX_UCHE_DBG_ECO_CNTL_2, regbit); /* Protect registers from the CP */ gpu_write(gpu, REG_A5XX_CP_PROTECT_CNTL, 0x00000007); @@ -694,7 +826,9 @@ static int a5xx_hw_init(struct msm_gpu *gpu) /* UCHE */ gpu_write(gpu, REG_A5XX_CP_PROTECT(16), ADRENO_PROTECT_RW(0xE80, 16)); - if (adreno_is_a530(adreno_gpu) || adreno_is_a510(adreno_gpu)) + if (adreno_is_a508(adreno_gpu) || adreno_is_a509(adreno_gpu) || + adreno_is_a510(adreno_gpu) || adreno_is_a512(adreno_gpu) || + adreno_is_a530(adreno_gpu)) gpu_write(gpu, REG_A5XX_CP_PROTECT(17), ADRENO_PROTECT_RW(0x10000, 0x8000)); @@ -736,7 +870,8 @@ static int a5xx_hw_init(struct msm_gpu *gpu) if (ret) return ret; - if (!adreno_is_a510(adreno_gpu)) + if (!(adreno_is_a508(adreno_gpu) || adreno_is_a509(adreno_gpu) || + adreno_is_a510(adreno_gpu) || adreno_is_a512(adreno_gpu))) a5xx_gpmu_ucode_init(gpu); ret = a5xx_ucode_init(gpu); @@ -1169,7 +1304,8 @@ static int a5xx_pm_resume(struct msm_gpu *gpu) if (ret) return ret; - if (adreno_is_a510(adreno_gpu)) { + /* Adreno 508, 509, 510, 512 needs manual RBBM sus/res control */ + if (!(adreno_is_a530(adreno_gpu) || adreno_is_a540(adreno_gpu))) { /* Halt the sp_input_clk at HM level */ gpu_write(gpu, REG_A5XX_RBBM_CLOCK_CNTL, 0x00000055); a5xx_set_hwcg(gpu, true); @@ -1211,8 +1347,8 @@ static int a5xx_pm_suspend(struct msm_gpu *gpu) u32 mask = 0xf; int i, ret; - /* A510 has 3 XIN ports in VBIF */ - if (adreno_is_a510(adreno_gpu)) + /* A508, A510 have 3 XIN ports in VBIF */ + if (adreno_is_a508(adreno_gpu) || adreno_is_a510(adreno_gpu)) mask = 0x7; /* Clear the VBIF pipe before shutting down */ diff --git a/drivers/gpu/drm/msm/adreno/a5xx_power.c b/drivers/gpu/drm/msm/adreno/a5xx_power.c index f176a6f3eff6..5ccc9da455a1 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_power.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_power.c @@ -298,7 +298,7 @@ int a5xx_power_init(struct msm_gpu *gpu) int ret; /* Not all A5xx chips have a GPMU */ - if (adreno_is_a510(adreno_gpu)) + if (!(adreno_is_a530(adreno_gpu) || adreno_is_a540(adreno_gpu))) return 0; /* Set up the limits management */ @@ -330,7 +330,7 @@ void a5xx_gpmu_ucode_init(struct msm_gpu *gpu) unsigned int *data, *ptr, *cmds; unsigned int cmds_size; - if (adreno_is_a510(adreno_gpu)) + if (!(adreno_is_a530(adreno_gpu) || adreno_is_a540(adreno_gpu))) return; if (a5xx_gpu->gpmu_bo) diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index ce9345cc29a4..600d445fabe8 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -133,6 +133,41 @@ static const struct adreno_info gpulist[] = { .gmem = (SZ_1M + SZ_512K), .inactive_period = DRM_MSM_INACTIVE_PERIOD, .init = a4xx_gpu_init, + }, { + .rev = ADRENO_REV(5, 0, 8, ANY_ID), + .revn = 508, + .name = "A508", + .fw = { + [ADRENO_FW_PM4] = "a530_pm4.fw", + [ADRENO_FW_PFP] = "a530_pfp.fw", + }, + .gmem = (SZ_128K + SZ_8K), + /* + * Increase inactive period to 250 to avoid bouncing + * the GDSC which appears to make it grumpy + */ + .inactive_period = 250, + .quirks = ADRENO_QUIRK_LMLOADKILL_DISABLE, + .init = a5xx_gpu_init, + .zapfw = "a508_zap.mdt", + }, { + .rev = ADRENO_REV(5, 0, 9, ANY_ID), + .revn = 509, + .name = "A509", + .fw = { + [ADRENO_FW_PM4] = "a530_pm4.fw", + [ADRENO_FW_PFP] = "a530_pfp.fw", + }, + .gmem = (SZ_256K + SZ_16K), + /* + * Increase inactive period to 250 to avoid bouncing + * the GDSC which appears to make it grumpy + */ + .inactive_period = 250, + .quirks = ADRENO_QUIRK_LMLOADKILL_DISABLE, + .init = a5xx_gpu_init, + /* Adreno 509 uses the same ZAP as 512 */ + .zapfw = "a512_zap.mdt", }, { .rev = ADRENO_REV(5, 1, 0, ANY_ID), .revn = 510, @@ -148,6 +183,23 @@ static const struct adreno_info gpulist[] = { */ .inactive_period = 250, .init = a5xx_gpu_init, + }, { + .rev = ADRENO_REV(5, 1, 2, ANY_ID), + .revn = 512, + .name = "A512", + .fw = { + [ADRENO_FW_PM4] = "a530_pm4.fw", + [ADRENO_FW_PFP] = "a530_pfp.fw", + }, + .gmem = (SZ_256K + SZ_16K), + /* + * Increase inactive period to 250 to avoid bouncing + * the GDSC which appears to make it grumpy + */ + .inactive_period = 250, + .quirks = ADRENO_QUIRK_LMLOADKILL_DISABLE, + .init = a5xx_gpu_init, + .zapfw = "a512_zap.mdt", }, { .rev = ADRENO_REV(5, 3, 0, 2), .revn = 530, diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 2a3d049b46b5..ccac275aa7a2 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -197,11 +197,26 @@ static inline int adreno_is_a430(struct adreno_gpu *gpu) return gpu->revn == 430; } +static inline int adreno_is_a508(struct adreno_gpu *gpu) +{ + return gpu->revn == 508; +} + +static inline int adreno_is_a509(struct adreno_gpu *gpu) +{ + return gpu->revn == 509; +} + static inline int adreno_is_a510(struct adreno_gpu *gpu) { return gpu->revn == 510; } +static inline int adreno_is_a512(struct adreno_gpu *gpu) +{ + return gpu->revn == 512; +} + static inline int adreno_is_a530(struct adreno_gpu *gpu) { return gpu->revn == 530; From 9deba2b8a5b5e15e8603a6b06abc0e6493c0f868 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 13 Jan 2021 19:33:36 +0100 Subject: [PATCH 057/506] drm/msm/a5xx: Reset VBIF before PC only on A510 and A530 Resetting the VBIF before power collapse is done to avoid getting bogus FIFO entries during the suspend sequence or subsequent resume, but this is doable only on Adreno 510 and Adreno 530, as the other units will tendentially lock up. Especially on Adreno 508, the GPU will show lockups and very bad slownesses after processing the first frame. Avoiding to execute the RBBM SW Reset before suspend will stop the lockup issue from happening on at least Adreno 508/509/512. Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 04ffd84c1190..66980f4cd93e 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1360,10 +1360,12 @@ static int a5xx_pm_suspend(struct msm_gpu *gpu) /* * Reset the VBIF before power collapse to avoid issue with FIFO - * entries + * entries on Adreno A510 and A530 (the others will tend to lock up) */ - gpu_write(gpu, REG_A5XX_RBBM_BLOCK_SW_RESET_CMD, 0x003C0000); - gpu_write(gpu, REG_A5XX_RBBM_BLOCK_SW_RESET_CMD, 0x00000000); + if (adreno_is_a510(adreno_gpu) || adreno_is_a530(adreno_gpu)) { + gpu_write(gpu, REG_A5XX_RBBM_BLOCK_SW_RESET_CMD, 0x003C0000); + gpu_write(gpu, REG_A5XX_RBBM_BLOCK_SW_RESET_CMD, 0x00000000); + } ret = msm_gpu_pm_suspend(gpu); if (ret) From 89c1ab960717fb0e9b61c8cf90316bd41309e90a Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 13 Jan 2021 19:33:37 +0100 Subject: [PATCH 058/506] drm/msm/a5xx: Fix VPC protect value in gpu_write() The upstream API for some reason uses logbase2 instead of just passing the argument as-is, whereas downstream CAF kernel does the latter. Hence, a mistake has been made when porting: 4 is the value that's supposed to be passed, but log2(4) = 2. Changing the value to 16 (= 2^4) fixes the issue. Signed-off-by: Konrad Dybcio Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 66980f4cd93e..24ab51bb5a01 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -821,7 +821,7 @@ static int a5xx_hw_init(struct msm_gpu *gpu) /* VPC */ gpu_write(gpu, REG_A5XX_CP_PROTECT(14), ADRENO_PROTECT_RW(0xE68, 8)); - gpu_write(gpu, REG_A5XX_CP_PROTECT(15), ADRENO_PROTECT_RW(0xE70, 4)); + gpu_write(gpu, REG_A5XX_CP_PROTECT(15), ADRENO_PROTECT_RW(0xE70, 16)); /* UCHE */ gpu_write(gpu, REG_A5XX_CP_PROTECT(16), ADRENO_PROTECT_RW(0xE80, 16)); From cce212d84415635ba9a848afd94e9f61444c9187 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 13 Jan 2021 19:33:38 +0100 Subject: [PATCH 059/506] drm/msm/a5xx: Disable flat shading optimization Port over the command from downstream to prevent undefined behaviour. Signed-off-by: Konrad Dybcio Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 24ab51bb5a01..23fc851756de 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -791,6 +791,9 @@ static int a5xx_hw_init(struct msm_gpu *gpu) adreno_is_a540(adreno_gpu)) gpu_write(gpu, REG_A5XX_UCHE_DBG_ECO_CNTL_2, regbit); + /* Disable All flat shading optimization (ALLFLATOPTDIS) */ + gpu_rmw(gpu, REG_A5XX_VPC_DBG_ECO_CNTL, 0, (1 << 10)); + /* Protect registers from the CP */ gpu_write(gpu, REG_A5XX_CP_PROTECT_CNTL, 0x00000007); From 3f2bc3856bf175c751736c7323188a401ad10a0d Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 13 Jan 2021 19:33:39 +0100 Subject: [PATCH 060/506] drm/msm/a5xx: Disable UCHE global filter Port over the command from downstream to prevent undefined behaviour. Signed-off-by: Konrad Dybcio Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx.xml.h | 2 ++ drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a5xx.xml.h b/drivers/gpu/drm/msm/adreno/a5xx.xml.h index 346cc6ff3a36..7b9fcfe95c04 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx.xml.h +++ b/drivers/gpu/drm/msm/adreno/a5xx.xml.h @@ -2367,6 +2367,8 @@ static inline uint32_t A5XX_VSC_RESOLVE_CNTL_Y(uint32_t val) #define REG_A5XX_UCHE_ADDR_MODE_CNTL 0x00000e80 +#define REG_A5XX_UCHE_MODE_CNTL 0x00000e81 + #define REG_A5XX_UCHE_SVM_CNTL 0x00000e82 #define REG_A5XX_UCHE_WRITE_THRU_BASE_LO 0x00000e87 diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 23fc851756de..7e553d3efeb2 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -754,6 +754,9 @@ static int a5xx_hw_init(struct msm_gpu *gpu) adreno_is_a512(adreno_gpu)) gpu_rmw(gpu, REG_A5XX_RB_DBG_ECO_CNTL, 0, (1 << 9)); + /* Disable UCHE global filter as SP can invalidate/flush independently */ + gpu_write(gpu, REG_A5XX_UCHE_MODE_CNTL, BIT(29)); + /* Enable USE_RETENTION_FLOPS */ gpu_write(gpu, REG_A5XX_CP_CHICKEN_DBG, 0x02000000); From e0485f1d042188bd501746d01a13417e327b40cc Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Jan 2021 20:26:26 +0100 Subject: [PATCH 061/506] drm/msm/dpu: Fix VBIF_XINL_QOS_LVL_REMAP_000 register offset On DPUs prior to version 4 the VBIF_XINL_QOS_LVL_REMAP_000 register is at 0x570 offset from vbif base instead of 0x590, due to the VBIF_XINL_QOS_RP_REMAP_000 having less instances (less possible XINs). Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_vbif.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_vbif.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_vbif.c index cf867f3f7c36..b757054e1c23 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_vbif.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_vbif.c @@ -30,7 +30,7 @@ #define VBIF_XIN_HALT_CTRL0 0x0200 #define VBIF_XIN_HALT_CTRL1 0x0204 #define VBIF_XINL_QOS_RP_REMAP_000 0x0550 -#define VBIF_XINL_QOS_LVL_REMAP_000 0x0590 +#define VBIF_XINL_QOS_LVL_REMAP_000(v) (v < DPU_HW_VER_400 ? 0x570 : 0x0590) static void dpu_hw_clear_errors(struct dpu_hw_vbif *vbif, u32 *pnd_errors, u32 *src_errors) @@ -156,18 +156,19 @@ static void dpu_hw_set_qos_remap(struct dpu_hw_vbif *vbif, u32 xin_id, u32 level, u32 remap_level) { struct dpu_hw_blk_reg_map *c; - u32 reg_val, reg_val_lvl, mask, reg_high, reg_shift; + u32 reg_lvl, reg_val, reg_val_lvl, mask, reg_high, reg_shift; if (!vbif) return; c = &vbif->hw; + reg_lvl = VBIF_XINL_QOS_LVL_REMAP_000(c->hwversion); reg_high = ((xin_id & 0x8) >> 3) * 4 + (level * 8); reg_shift = (xin_id & 0x7) * 4; reg_val = DPU_REG_READ(c, VBIF_XINL_QOS_RP_REMAP_000 + reg_high); - reg_val_lvl = DPU_REG_READ(c, VBIF_XINL_QOS_LVL_REMAP_000 + reg_high); + reg_val_lvl = DPU_REG_READ(c, reg_lvl + reg_high); mask = 0x7 << reg_shift; @@ -178,7 +179,7 @@ static void dpu_hw_set_qos_remap(struct dpu_hw_vbif *vbif, reg_val_lvl |= (remap_level << reg_shift) & mask; DPU_REG_WRITE(c, VBIF_XINL_QOS_RP_REMAP_000 + reg_high, reg_val); - DPU_REG_WRITE(c, VBIF_XINL_QOS_LVL_REMAP_000 + reg_high, reg_val_lvl); + DPU_REG_WRITE(c, reg_lvl + reg_high, reg_val_lvl); } static void dpu_hw_set_write_gather_en(struct dpu_hw_vbif *vbif, u32 xin_id) From b8dab65b5ac3dc31a80f415e38ebcc74b21ffbed Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Jan 2021 20:26:27 +0100 Subject: [PATCH 062/506] drm/msm/dpu: Move DPU_SSPP_QOS_8LVL bit to SDM845 and SC7180 masks Not all DPU versions that are supported in this driver are supposed to have a 8-Levels VIG QoS setting. Move this flag to SDM845 and SC7180 specific masks. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 90393fe9e59c..7a17ebaf635c 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -12,14 +12,14 @@ #define VIG_MASK \ (BIT(DPU_SSPP_SRC) | BIT(DPU_SSPP_QOS) |\ - BIT(DPU_SSPP_CSC_10BIT) | BIT(DPU_SSPP_CDP) | BIT(DPU_SSPP_QOS_8LVL) |\ + BIT(DPU_SSPP_CSC_10BIT) | BIT(DPU_SSPP_CDP) |\ BIT(DPU_SSPP_TS_PREFILL) | BIT(DPU_SSPP_EXCL_RECT)) #define VIG_SDM845_MASK \ - (VIG_MASK | BIT(DPU_SSPP_SCALER_QSEED3)) + (VIG_MASK | BIT(DPU_SSPP_QOS_8LVL) | BIT(DPU_SSPP_SCALER_QSEED3)) #define VIG_SC7180_MASK \ - (VIG_MASK | BIT(DPU_SSPP_SCALER_QSEED4)) + (VIG_MASK | BIT(DPU_SSPP_QOS_8LVL) | BIT(DPU_SSPP_SCALER_QSEED4)) #define DMA_SDM845_MASK \ (BIT(DPU_SSPP_SRC) | BIT(DPU_SSPP_QOS) | BIT(DPU_SSPP_QOS_8LVL) |\ From 731806da2916f7b0f0b86ac5f80b287f08bfcf42 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Jan 2021 20:26:28 +0100 Subject: [PATCH 063/506] drm/msm/dpu: Add prog_fetch_lines_worst_case to INTF_BLK macro Not all DPU interface sub-block versions need the same value for prog_fetch_lines_worst_case: add this to the INTF_BLK macro, so that it becomes possible to vary it for other INTF versions. For example, this is needed to implement support for older SoCs, like MSM8998 and SDM630/660 and most probably will also be needed for future SoCs. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 7a17ebaf635c..48d490f65840 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -624,33 +624,33 @@ static const struct dpu_merge_3d_cfg sm8150_merge_3d[] = { /************************************************************* * INTF sub blocks config *************************************************************/ -#define INTF_BLK(_name, _id, _base, _type, _ctrl_id, _features) \ +#define INTF_BLK(_name, _id, _base, _type, _ctrl_id, _progfetch, _features) \ {\ .name = _name, .id = _id, \ .base = _base, .len = 0x280, \ .features = _features, \ .type = _type, \ .controller_id = _ctrl_id, \ - .prog_fetch_lines_worst_case = 24 \ + .prog_fetch_lines_worst_case = _progfetch \ } static const struct dpu_intf_cfg sdm845_intf[] = { - INTF_BLK("intf_0", INTF_0, 0x6A000, INTF_DP, 0, INTF_SDM845_MASK), - INTF_BLK("intf_1", INTF_1, 0x6A800, INTF_DSI, 0, INTF_SDM845_MASK), - INTF_BLK("intf_2", INTF_2, 0x6B000, INTF_DSI, 1, INTF_SDM845_MASK), - INTF_BLK("intf_3", INTF_3, 0x6B800, INTF_DP, 1, INTF_SDM845_MASK), + INTF_BLK("intf_0", INTF_0, 0x6A000, INTF_DP, 0, 24, INTF_SDM845_MASK), + INTF_BLK("intf_1", INTF_1, 0x6A800, INTF_DSI, 0, 24, INTF_SDM845_MASK), + INTF_BLK("intf_2", INTF_2, 0x6B000, INTF_DSI, 1, 24, INTF_SDM845_MASK), + INTF_BLK("intf_3", INTF_3, 0x6B800, INTF_DP, 1, 24, INTF_SDM845_MASK), }; static const struct dpu_intf_cfg sc7180_intf[] = { - INTF_BLK("intf_0", INTF_0, 0x6A000, INTF_DP, 0, INTF_SC7180_MASK), - INTF_BLK("intf_1", INTF_1, 0x6A800, INTF_DSI, 0, INTF_SC7180_MASK), + INTF_BLK("intf_0", INTF_0, 0x6A000, INTF_DP, 0, 24, INTF_SC7180_MASK), + INTF_BLK("intf_1", INTF_1, 0x6A800, INTF_DSI, 0, 24, INTF_SC7180_MASK), }; static const struct dpu_intf_cfg sm8150_intf[] = { - INTF_BLK("intf_0", INTF_0, 0x6A000, INTF_DP, 0, INTF_SC7180_MASK), - INTF_BLK("intf_1", INTF_1, 0x6A800, INTF_DSI, 0, INTF_SC7180_MASK), - INTF_BLK("intf_2", INTF_2, 0x6B000, INTF_DSI, 1, INTF_SC7180_MASK), - INTF_BLK("intf_3", INTF_3, 0x6B800, INTF_DP, 1, INTF_SC7180_MASK), + INTF_BLK("intf_0", INTF_0, 0x6A000, INTF_DP, 0, 24, INTF_SC7180_MASK), + INTF_BLK("intf_1", INTF_1, 0x6A800, INTF_DSI, 0, 24, INTF_SC7180_MASK), + INTF_BLK("intf_2", INTF_2, 0x6B000, INTF_DSI, 1, 24, INTF_SC7180_MASK), + INTF_BLK("intf_3", INTF_3, 0x6B800, INTF_DP, 1, 24, INTF_SC7180_MASK), }; /************************************************************* From 862314bc94ddd4d1a8634cdacf279d302157c7b2 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Jan 2021 20:26:29 +0100 Subject: [PATCH 064/506] drm/msm/dpu: Allow specifying features and sblk in DSPP_BLK macro The DSPP_BLK macro was ad-hoc made for SC7180, but this is wrong because not all of the DPU DSPP versions can use the same DSPP block configuration, and not all of them have got the same features. For this reason, add two more params to the DSPP_BLK macro, so that it is possible to specify the feature mask and the sblk config for each DSPP. Fixes: 4259ff7ae509 ("drm/msm/dpu: add support for pcc color block in dpu driver") Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 48d490f65840..2e0f6f726a1b 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -532,23 +532,28 @@ static const struct dpu_dspp_sub_blks sm8150_dspp_sblk = { .len = 0x90, .version = 0x40000}, }; -#define DSPP_BLK(_name, _id, _base, _sblk) \ +#define DSPP_BLK(_name, _id, _base, _mask, _sblk) \ {\ .name = _name, .id = _id, \ .base = _base, .len = 0x1800, \ - .features = DSPP_SC7180_MASK, \ + .features = _mask, \ .sblk = _sblk \ } static const struct dpu_dspp_cfg sc7180_dspp[] = { - DSPP_BLK("dspp_0", DSPP_0, 0x54000, &sc7180_dspp_sblk), + DSPP_BLK("dspp_0", DSPP_0, 0x54000, DSPP_SC7180_MASK, + &sc7180_dspp_sblk), }; static const struct dpu_dspp_cfg sm8150_dspp[] = { - DSPP_BLK("dspp_0", DSPP_0, 0x54000, &sm8150_dspp_sblk), - DSPP_BLK("dspp_1", DSPP_1, 0x56000, &sm8150_dspp_sblk), - DSPP_BLK("dspp_2", DSPP_2, 0x58000, &sm8150_dspp_sblk), - DSPP_BLK("dspp_3", DSPP_3, 0x5a000, &sm8150_dspp_sblk), + DSPP_BLK("dspp_0", DSPP_0, 0x54000, DSPP_SC7180_MASK, + &sm8150_dspp_sblk), + DSPP_BLK("dspp_1", DSPP_1, 0x56000, DSPP_SC7180_MASK, + &sm8150_dspp_sblk), + DSPP_BLK("dspp_2", DSPP_2, 0x58000, DSPP_SC7180_MASK, + &sm8150_dspp_sblk), + DSPP_BLK("dspp_3", DSPP_3, 0x5a000, DSPP_SC7180_MASK, + &sm8150_dspp_sblk), }; /************************************************************* From aa9223a602544fea033b63e1b1d6c8d9eaa4ed43 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Jan 2021 20:26:30 +0100 Subject: [PATCH 065/506] drm/msm/dpu: Disable autorefresh in command mode When a command mode display is used, it may be retaining the bootloader configuration which, in most of the cases, enables the autorefresh feature in order to keep the splash up. Since there is no autorefresh management in this driver, wire up the autorefresh ops in the dpu_hw_pingpong and disable the feature when preparing for cmd commit: instead of disabling it when initializing the command mode, this road was chosen as to open future possibility of enabling and managing the autorefresh feature in the driver. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- .../drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 68 +++++++++++++++++++ .../gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c | 26 +++++++ .../gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.h | 14 ++++ 3 files changed, 108 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c index 5a056c1191df..4d3481baaead 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c @@ -4,8 +4,10 @@ */ #define pr_fmt(fmt) "[drm:%s:%d] " fmt, __func__, __LINE__ +#include #include "dpu_encoder_phys.h" #include "dpu_hw_interrupts.h" +#include "dpu_hw_pingpong.h" #include "dpu_core_irq.h" #include "dpu_formats.h" #include "dpu_trace.h" @@ -35,6 +37,8 @@ #define DPU_ENC_WR_PTR_START_TIMEOUT_US 20000 +#define DPU_ENC_MAX_POLL_TIMEOUT_US 2000 + static bool dpu_encoder_phys_cmd_is_master(struct dpu_encoder_phys *phys_enc) { return (phys_enc->split_role != ENC_ROLE_SLAVE) ? true : false; @@ -580,6 +584,69 @@ static void dpu_encoder_phys_cmd_prepare_for_kickoff( atomic_read(&phys_enc->pending_kickoff_cnt)); } +static bool dpu_encoder_phys_cmd_is_ongoing_pptx( + struct dpu_encoder_phys *phys_enc) +{ + struct dpu_hw_pp_vsync_info info; + + if (!phys_enc) + return false; + + phys_enc->hw_pp->ops.get_vsync_info(phys_enc->hw_pp, &info); + if (info.wr_ptr_line_count > 0 && + info.wr_ptr_line_count < phys_enc->cached_mode.vdisplay) + return true; + + return false; +} + +static void dpu_encoder_phys_cmd_prepare_commit( + struct dpu_encoder_phys *phys_enc) +{ + struct dpu_encoder_phys_cmd *cmd_enc = + to_dpu_encoder_phys_cmd(phys_enc); + int trial = 0; + + if (!phys_enc) + return; + if (!phys_enc->hw_pp) + return; + if (!dpu_encoder_phys_cmd_is_master(phys_enc)) + return; + + /* If autorefresh is already disabled, we have nothing to do */ + if (!phys_enc->hw_pp->ops.get_autorefresh(phys_enc->hw_pp, NULL)) + return; + + /* + * If autorefresh is enabled, disable it and make sure it is safe to + * proceed with current frame commit/push. Sequence fallowed is, + * 1. Disable TE + * 2. Disable autorefresh config + * 4. Poll for frame transfer ongoing to be false + * 5. Enable TE back + */ + _dpu_encoder_phys_cmd_connect_te(phys_enc, false); + phys_enc->hw_pp->ops.setup_autorefresh(phys_enc->hw_pp, 0, false); + + do { + udelay(DPU_ENC_MAX_POLL_TIMEOUT_US); + if ((trial * DPU_ENC_MAX_POLL_TIMEOUT_US) + > (KICKOFF_TIMEOUT_MS * USEC_PER_MSEC)) { + DPU_ERROR_CMDENC(cmd_enc, + "disable autorefresh failed\n"); + break; + } + + trial++; + } while (dpu_encoder_phys_cmd_is_ongoing_pptx(phys_enc)); + + _dpu_encoder_phys_cmd_connect_te(phys_enc, true); + + DPU_DEBUG_CMDENC(to_dpu_encoder_phys_cmd(phys_enc), + "disabled autorefresh\n"); +} + static int _dpu_encoder_phys_cmd_wait_for_ctl_start( struct dpu_encoder_phys *phys_enc) { @@ -681,6 +748,7 @@ static void dpu_encoder_phys_cmd_trigger_start( static void dpu_encoder_phys_cmd_init_ops( struct dpu_encoder_phys_ops *ops) { + ops->prepare_commit = dpu_encoder_phys_cmd_prepare_commit; ops->is_master = dpu_encoder_phys_cmd_is_master; ops->mode_set = dpu_encoder_phys_cmd_mode_set; ops->mode_fixup = dpu_encoder_phys_cmd_mode_fixup; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c index bea4ab5c58c5..245a7a62b5c6 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c @@ -23,6 +23,7 @@ #define PP_WR_PTR_IRQ 0x024 #define PP_OUT_LINE_COUNT 0x028 #define PP_LINE_COUNT 0x02C +#define PP_AUTOREFRESH_CONFIG 0x030 #define PP_FBC_MODE 0x034 #define PP_FBC_BUDGET_CTL 0x038 @@ -120,6 +121,29 @@ static int dpu_hw_pp_setup_te_config(struct dpu_hw_pingpong *pp, return 0; } +static void dpu_hw_pp_setup_autorefresh_config(struct dpu_hw_pingpong *pp, + u32 frame_count, bool enable) +{ + DPU_REG_WRITE(&pp->hw, PP_AUTOREFRESH_CONFIG, + enable ? (BIT(31) | frame_count) : 0); +} + +/* + * dpu_hw_pp_get_autorefresh_config - Get autorefresh config from HW + * @pp: DPU pingpong structure + * @frame_count: Used to return the current frame count from hw + * + * Returns: True if autorefresh enabled, false if disabled. + */ +static bool dpu_hw_pp_get_autorefresh_config(struct dpu_hw_pingpong *pp, + u32 *frame_count) +{ + u32 val = DPU_REG_READ(&pp->hw, PP_AUTOREFRESH_CONFIG); + if (frame_count != NULL) + *frame_count = val & 0xffff; + return !!((val & BIT(31)) >> 31); +} + static int dpu_hw_pp_poll_timeout_wr_ptr(struct dpu_hw_pingpong *pp, u32 timeout_us) { @@ -228,6 +252,8 @@ static void _setup_pingpong_ops(struct dpu_hw_pingpong *c, c->ops.enable_tearcheck = dpu_hw_pp_enable_te; c->ops.connect_external_te = dpu_hw_pp_connect_external_te; c->ops.get_vsync_info = dpu_hw_pp_get_vsync_info; + c->ops.setup_autorefresh = dpu_hw_pp_setup_autorefresh_config; + c->ops.get_autorefresh = dpu_hw_pp_get_autorefresh_config; c->ops.poll_timeout_wr_ptr = dpu_hw_pp_poll_timeout_wr_ptr; c->ops.get_line_count = dpu_hw_pp_get_line_count; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.h index 6902b9b95c8e..845b9ce80e31 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.h @@ -63,6 +63,8 @@ struct dpu_hw_dither_cfg { * @setup_tearcheck : program tear check values * @enable_tearcheck : enables tear check * @get_vsync_info : retries timing info of the panel + * @setup_autorefresh : configure and enable the autorefresh config + * @get_autorefresh : retrieve autorefresh config from hardware * @setup_dither : function to program the dither hw block * @get_line_count: obtain current vertical line counter */ @@ -94,6 +96,18 @@ struct dpu_hw_pingpong_ops { int (*get_vsync_info)(struct dpu_hw_pingpong *pp, struct dpu_hw_pp_vsync_info *info); + /** + * configure and enable the autorefresh config + */ + void (*setup_autorefresh)(struct dpu_hw_pingpong *pp, + u32 frame_count, bool enable); + + /** + * retrieve autorefresh config from hardware + */ + bool (*get_autorefresh)(struct dpu_hw_pingpong *pp, + u32 *frame_count); + /** * poll until write pointer transmission starts * @Return: 0 on success, -ETIMEDOUT on timeout From da9e7b7696d851a7229257368d12d95bbd210f5d Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Jan 2021 20:26:31 +0100 Subject: [PATCH 066/506] drm/msm/dpu: Correctly configure vsync tearcheck for command mode When configuring the tearcheck, the parameters for the engine were being set mostly as they should've been, but then it wasn't getting configured to get the vsync indication from the TE GPIO input because it was assumed that autorefresh could be enabled: since a previous commit makes sure to disable the autorefresh bit when committing to the cmd engine, it is now safe to just enable the vsync pin input at tearcheck setup time (instead of erroneously never enabling it). Also, set the right sync_cfg_height to enable the DPU auto-generated TE signal in order to avoid stalls in the event that we miss one external TE signal: this will still trigger recovery mechanisms in case the display is really unreachable. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c index 4d3481baaead..665eb1d4cb8a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c @@ -372,15 +372,12 @@ static void dpu_encoder_phys_cmd_tearcheck_config( tc_cfg.vsync_count = vsync_hz / (mode->vtotal * drm_mode_vrefresh(mode)); - /* enable external TE after kickoff to avoid premature autorefresh */ - tc_cfg.hw_vsync_mode = 0; - /* - * By setting sync_cfg_height to near max register value, we essentially - * disable dpu hw generated TE signal, since hw TE will arrive first. - * Only caveat is if due to error, we hit wrap-around. + * Set the sync_cfg_height to twice vtotal so that if we lose a + * TE event coming from the display TE pin we won't stall immediately */ - tc_cfg.sync_cfg_height = 0xFFF0; + tc_cfg.hw_vsync_mode = 1; + tc_cfg.sync_cfg_height = mode->vtotal * 2; tc_cfg.vsync_init_val = mode->vdisplay; tc_cfg.sync_threshold_start = DEFAULT_TEARCHECK_SYNC_THRESH_START; tc_cfg.sync_threshold_continue = DEFAULT_TEARCHECK_SYNC_THRESH_CONTINUE; From fe286893ed34b12684659d3efb907d47fe18559b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Jan 2021 20:26:32 +0100 Subject: [PATCH 067/506] drm/msm/dpu: Remove unused call in wait_for_commit_done The call to dpu_encoder_phys_cmd_prepare_for_kickoff is useless as it's unused because the serialize_wait4pp variable is never set to true by .. anything, literally: remove the call. While at it, also reduce indentation by inverting the check for dpu_encoder_phys_cmd_is_master. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c index 665eb1d4cb8a..b2be39b9144e 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c @@ -685,20 +685,15 @@ static int dpu_encoder_phys_cmd_wait_for_tx_complete( static int dpu_encoder_phys_cmd_wait_for_commit_done( struct dpu_encoder_phys *phys_enc) { - int rc = 0; struct dpu_encoder_phys_cmd *cmd_enc; cmd_enc = to_dpu_encoder_phys_cmd(phys_enc); /* only required for master controller */ - if (dpu_encoder_phys_cmd_is_master(phys_enc)) - rc = _dpu_encoder_phys_cmd_wait_for_ctl_start(phys_enc); + if (!dpu_encoder_phys_cmd_is_master(phys_enc)) + return 0; - /* required for both controllers */ - if (!rc && cmd_enc->serialize_wait4pp) - dpu_encoder_phys_cmd_prepare_for_kickoff(phys_enc); - - return rc; + return _dpu_encoder_phys_cmd_wait_for_ctl_start(phys_enc); } static int dpu_encoder_phys_cmd_wait_for_vblank( From 9fc418430c6584827b188855e698c599a7ddc0f1 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Wed, 13 Jan 2021 11:00:00 -0800 Subject: [PATCH 068/506] drm/msm/dp: unplug interrupt missed after irq_hpd handler There is HPD unplug interrupts missed at scenario of an irq_hpd followed by unplug interrupts with around 10 ms in between. Since both AUX_SW_RESET and DP_SW_RESET clear pending HPD interrupts, irq_hpd handler should not issues either aux or sw reset to avoid following unplug interrupt be cleared accidentally. This patch also postpone handling of irq_hpd until connected state if it happened at connection pending state. Changes in V2: -- add postpone handling of irq_hpd until connected state -- check DP_TRAINING_1 instead of DP_TRAINING_NONE Signed-off-by: Kuogee Hsieh Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_aux.c | 7 ------- drivers/gpu/drm/msm/dp/dp_catalog.c | 24 ++++++++++++++++++++++++ drivers/gpu/drm/msm/dp/dp_ctrl.c | 15 ++++++++++----- drivers/gpu/drm/msm/dp/dp_display.c | 7 +++++++ 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_aux.c b/drivers/gpu/drm/msm/dp/dp_aux.c index 19b35ae3e927..1c6e1d2b947c 100644 --- a/drivers/gpu/drm/msm/dp/dp_aux.c +++ b/drivers/gpu/drm/msm/dp/dp_aux.c @@ -336,7 +336,6 @@ static ssize_t dp_aux_transfer(struct drm_dp_aux *dp_aux, ssize_t ret; int const aux_cmd_native_max = 16; int const aux_cmd_i2c_max = 128; - int const retry_count = 5; struct dp_aux_private *aux = container_of(dp_aux, struct dp_aux_private, dp_aux); @@ -378,12 +377,6 @@ static ssize_t dp_aux_transfer(struct drm_dp_aux *dp_aux, ret = dp_aux_cmd_fifo_tx(aux, msg); if (ret < 0) { - if (aux->native) { - aux->retry_cnt++; - if (!(aux->retry_cnt % retry_count)) - dp_catalog_aux_update_cfg(aux->catalog); - dp_catalog_aux_reset(aux->catalog); - } usleep_range(400, 500); /* at least 400us to next try */ goto unlock_exit; } diff --git a/drivers/gpu/drm/msm/dp/dp_catalog.c b/drivers/gpu/drm/msm/dp/dp_catalog.c index 44f0c57798d0..b1a9b1b98f5f 100644 --- a/drivers/gpu/drm/msm/dp/dp_catalog.c +++ b/drivers/gpu/drm/msm/dp/dp_catalog.c @@ -190,6 +190,18 @@ int dp_catalog_aux_clear_hw_interrupts(struct dp_catalog *dp_catalog) return 0; } +/** + * dp_catalog_aux_reset() - reset AUX controller + * + * @aux: DP catalog structure + * + * return: void + * + * This function reset AUX controller + * + * NOTE: reset AUX controller will also clear any pending HPD related interrupts + * + */ void dp_catalog_aux_reset(struct dp_catalog *dp_catalog) { u32 aux_ctrl; @@ -483,6 +495,18 @@ int dp_catalog_ctrl_set_pattern(struct dp_catalog *dp_catalog, return 0; } +/** + * dp_catalog_ctrl_reset() - reset DP controller + * + * @dp_catalog: DP catalog structure + * + * return: void + * + * This function reset the DP controller + * + * NOTE: reset DP controller will also clear any pending HPD related interrupts + * + */ void dp_catalog_ctrl_reset(struct dp_catalog *dp_catalog) { u32 sw_reset; diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c index e3462f5d96d7..5ac155d6b85f 100644 --- a/drivers/gpu/drm/msm/dp/dp_ctrl.c +++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c @@ -1296,7 +1296,8 @@ static int dp_ctrl_setup_main_link(struct dp_ctrl_private *ctrl, * transitioned to PUSH_IDLE. In order to start transmitting * a link training pattern, we have to first do soft reset. */ - dp_catalog_ctrl_reset(ctrl->catalog); + if (*training_step == DP_TRAINING_1) + dp_catalog_ctrl_reset(ctrl->catalog); ret = dp_ctrl_link_train(ctrl, cr, training_step); @@ -1491,15 +1492,18 @@ static int dp_ctrl_deinitialize_mainlink(struct dp_ctrl_private *ctrl) return 0; } +static void dp_ctrl_link_idle_reset(struct dp_ctrl_private *ctrl) +{ + dp_ctrl_push_idle(&ctrl->dp_ctrl); + dp_catalog_ctrl_reset(ctrl->catalog); +} + static int dp_ctrl_link_maintenance(struct dp_ctrl_private *ctrl) { int ret = 0; struct dp_cr_status cr; int training_step = DP_TRAINING_NONE; - dp_ctrl_push_idle(&ctrl->dp_ctrl); - dp_catalog_ctrl_reset(ctrl->catalog); - ctrl->dp_ctrl.pixel_rate = ctrl->panel->dp_mode.drm_mode.clock; ret = dp_ctrl_setup_main_link(ctrl, &cr, &training_step); @@ -1626,6 +1630,7 @@ void dp_ctrl_handle_sink_request(struct dp_ctrl *dp_ctrl) if (sink_request & DP_TEST_LINK_TRAINING) { dp_link_send_test_response(ctrl->link); + dp_ctrl_link_idle_reset(ctrl); if (dp_ctrl_link_maintenance(ctrl)) { DRM_ERROR("LM failed: TEST_LINK_TRAINING\n"); return; @@ -1679,7 +1684,7 @@ int dp_ctrl_on_link(struct dp_ctrl *dp_ctrl) break; } - training_step = DP_TRAINING_NONE; + training_step = DP_TRAINING_1; rc = dp_ctrl_setup_main_link(ctrl, &cr, &training_step); if (rc == 0) { /* training completed successfully */ diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 3bc7ed21de28..3a7e6c5adbf5 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -700,6 +700,13 @@ static int dp_irq_hpd_handle(struct dp_display_private *dp, u32 data) return 0; } + if (state == ST_CONNECT_PENDING) { + /* wait until ST_CONNECTED */ + dp_add_event(dp, EV_IRQ_HPD_INT, 0, 1); /* delay = 1 */ + mutex_unlock(&dp->event_mutex); + return 0; + } + ret = dp_display_usbpd_attention_cb(&dp->pdev->dev); if (ret == -ECONNRESET) { /* cable unplugged */ dp->core_initialized = false; From d21fc5dfc3dfe51429b7b55f23e79db7815048c7 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 15 Jan 2021 18:38:11 +0300 Subject: [PATCH 069/506] drm/msm/dpu1: add support for qseed3lite used on sm8250 SM8250 has quite unique qseed lut type: qseed3lite, which is a lightweight version of qseed3 scaler. Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark --- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 38 +++++++++- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h | 2 + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c | 1 + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.h | 1 + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.c | 73 ++++++++++++++++++- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.h | 3 + drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 1 + 7 files changed, 112 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 2e0f6f726a1b..189f3533525c 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -21,6 +21,9 @@ #define VIG_SC7180_MASK \ (VIG_MASK | BIT(DPU_SSPP_QOS_8LVL) | BIT(DPU_SSPP_SCALER_QSEED4)) +#define VIG_SM8250_MASK \ + (VIG_MASK | BIT(DPU_SSPP_SCALER_QSEED3LITE)) + #define DMA_SDM845_MASK \ (BIT(DPU_SSPP_SRC) | BIT(DPU_SSPP_QOS) | BIT(DPU_SSPP_QOS_8LVL) |\ BIT(DPU_SSPP_TS_PREFILL) | BIT(DPU_SSPP_TS_PREFILL_REC1) |\ @@ -185,7 +188,7 @@ static const struct dpu_caps sm8150_dpu_caps = { static const struct dpu_caps sm8250_dpu_caps = { .max_mixer_width = DEFAULT_DPU_OUTPUT_LINE_WIDTH, .max_mixer_blendstages = 0xb, - .qseed_type = DPU_SSPP_SCALER_QSEED3, /* TODO: qseed3 lite */ + .qseed_type = DPU_SSPP_SCALER_QSEED3LITE, .smart_dma_rev = DPU_SSPP_SMART_DMA_V2, /* TODO: v2.5 */ .ubwc_version = DPU_HW_UBWC_VER_40, .has_src_split = true, @@ -444,6 +447,34 @@ static const struct dpu_sspp_cfg sc7180_sspp[] = { sdm845_dma_sblk_2, 9, SSPP_TYPE_DMA, DPU_CLK_CTRL_CURSOR1), }; +static const struct dpu_sspp_sub_blks sm8250_vig_sblk_0 = + _VIG_SBLK("0", 5, DPU_SSPP_SCALER_QSEED3LITE); +static const struct dpu_sspp_sub_blks sm8250_vig_sblk_1 = + _VIG_SBLK("1", 6, DPU_SSPP_SCALER_QSEED3LITE); +static const struct dpu_sspp_sub_blks sm8250_vig_sblk_2 = + _VIG_SBLK("2", 7, DPU_SSPP_SCALER_QSEED3LITE); +static const struct dpu_sspp_sub_blks sm8250_vig_sblk_3 = + _VIG_SBLK("3", 8, DPU_SSPP_SCALER_QSEED3LITE); + +static const struct dpu_sspp_cfg sm8250_sspp[] = { + SSPP_BLK("sspp_0", SSPP_VIG0, 0x4000, VIG_SM8250_MASK, + sm8250_vig_sblk_0, 0, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG0), + SSPP_BLK("sspp_1", SSPP_VIG1, 0x6000, VIG_SM8250_MASK, + sm8250_vig_sblk_1, 4, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG1), + SSPP_BLK("sspp_2", SSPP_VIG2, 0x8000, VIG_SM8250_MASK, + sm8250_vig_sblk_2, 8, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG2), + SSPP_BLK("sspp_3", SSPP_VIG3, 0xa000, VIG_SM8250_MASK, + sm8250_vig_sblk_3, 12, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG3), + SSPP_BLK("sspp_8", SSPP_DMA0, 0x24000, DMA_SDM845_MASK, + sdm845_dma_sblk_0, 1, SSPP_TYPE_DMA, DPU_CLK_CTRL_DMA0), + SSPP_BLK("sspp_9", SSPP_DMA1, 0x26000, DMA_SDM845_MASK, + sdm845_dma_sblk_1, 5, SSPP_TYPE_DMA, DPU_CLK_CTRL_DMA1), + SSPP_BLK("sspp_10", SSPP_DMA2, 0x28000, DMA_CURSOR_SDM845_MASK, + sdm845_dma_sblk_2, 9, SSPP_TYPE_DMA, DPU_CLK_CTRL_CURSOR0), + SSPP_BLK("sspp_11", SSPP_DMA3, 0x2a000, DMA_CURSOR_SDM845_MASK, + sdm845_dma_sblk_3, 13, SSPP_TYPE_DMA, DPU_CLK_CTRL_CURSOR1), +}; + /************************************************************* * MIXER sub blocks config *************************************************************/ @@ -974,9 +1005,8 @@ static void sm8250_cfg_init(struct dpu_mdss_cfg *dpu_cfg) .mdp = sm8250_mdp, .ctl_count = ARRAY_SIZE(sm8150_ctl), .ctl = sm8150_ctl, - /* TODO: sspp qseed version differs from 845 */ - .sspp_count = ARRAY_SIZE(sdm845_sspp), - .sspp = sdm845_sspp, + .sspp_count = ARRAY_SIZE(sm8250_sspp), + .sspp = sm8250_sspp, .mixer_count = ARRAY_SIZE(sm8150_lm), .mixer = sm8150_lm, .dspp_count = ARRAY_SIZE(sm8150_dspp), diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h index eaef99db2d2f..ea4647d21a20 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h @@ -95,6 +95,7 @@ enum { * @DPU_SSPP_SRC Src and fetch part of the pipes, * @DPU_SSPP_SCALER_QSEED2, QSEED2 algorithm support * @DPU_SSPP_SCALER_QSEED3, QSEED3 alogorithm support + * @DPU_SSPP_SCALER_QSEED3LITE, QSEED3 Lite alogorithm support * @DPU_SSPP_SCALER_QSEED4, QSEED4 algorithm support * @DPU_SSPP_SCALER_RGB, RGB Scaler, supported by RGB pipes * @DPU_SSPP_CSC, Support of Color space converion @@ -114,6 +115,7 @@ enum { DPU_SSPP_SRC = 0x1, DPU_SSPP_SCALER_QSEED2, DPU_SSPP_SCALER_QSEED3, + DPU_SSPP_SCALER_QSEED3LITE, DPU_SSPP_SCALER_QSEED4, DPU_SSPP_SCALER_RGB, DPU_SSPP_CSC, diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c index 2c2ca5335aa8..34d81aa16041 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c @@ -673,6 +673,7 @@ static void _setup_layer_ops(struct dpu_hw_pipe *c, c->ops.setup_multirect = dpu_hw_sspp_setup_multirect; if (test_bit(DPU_SSPP_SCALER_QSEED3, &features) || + test_bit(DPU_SSPP_SCALER_QSEED3LITE, &features) || test_bit(DPU_SSPP_SCALER_QSEED4, &features)) { c->ops.setup_scaler = _dpu_hw_sspp_setup_scaler3; c->ops.get_scaler_ver = _dpu_hw_sspp_get_scaler3_ver; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.h index 85b018a9b03c..fdfd4b46e2c6 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.h @@ -28,6 +28,7 @@ struct dpu_hw_pipe; #define DPU_SSPP_SCALER ((1UL << DPU_SSPP_SCALER_RGB) | \ (1UL << DPU_SSPP_SCALER_QSEED2) | \ (1UL << DPU_SSPP_SCALER_QSEED3) | \ + (1UL << DPU_SSPP_SCALER_QSEED3LITE) | \ (1UL << DPU_SSPP_SCALER_QSEED4)) /** diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.c index 84e9875994a8..f94584c982cd 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.c @@ -59,6 +59,19 @@ static u32 dpu_hw_util_log_mask = DPU_DBG_MASK_NONE; #define QSEED3_SEP_LUT_SIZE \ (QSEED3_LUT_SIZE * QSEED3_SEPARABLE_LUTS * sizeof(u32)) +/* DPU_SCALER_QSEED3LITE */ +#define QSEED3LITE_COEF_LUT_Y_SEP_BIT 4 +#define QSEED3LITE_COEF_LUT_UV_SEP_BIT 5 +#define QSEED3LITE_COEF_LUT_CTRL 0x4C +#define QSEED3LITE_COEF_LUT_SWAP_BIT 0 +#define QSEED3LITE_DIR_FILTER_WEIGHT 0x60 +#define QSEED3LITE_FILTERS 2 +#define QSEED3LITE_SEPARABLE_LUTS 10 +#define QSEED3LITE_LUT_SIZE 33 +#define QSEED3LITE_SEP_LUT_SIZE \ + (QSEED3LITE_LUT_SIZE * QSEED3LITE_SEPARABLE_LUTS * sizeof(u32)) + + void dpu_reg_write(struct dpu_hw_blk_reg_map *c, u32 reg_off, u32 val, @@ -156,6 +169,57 @@ static void _dpu_hw_setup_scaler3_lut(struct dpu_hw_blk_reg_map *c, } +static void _dpu_hw_setup_scaler3lite_lut(struct dpu_hw_blk_reg_map *c, + struct dpu_hw_scaler3_cfg *scaler3_cfg, u32 offset) +{ + int j, filter; + int config_lut = 0x0; + unsigned long lut_flags; + u32 lut_addr, lut_offset; + u32 *lut[QSEED3LITE_FILTERS] = {NULL, NULL}; + static const uint32_t off_tbl[QSEED3_FILTERS] = { 0x000, 0x200 }; + + DPU_REG_WRITE(c, QSEED3LITE_DIR_FILTER_WEIGHT + offset, scaler3_cfg->dir_weight); + + if (!scaler3_cfg->sep_lut) + return; + + lut_flags = (unsigned long) scaler3_cfg->lut_flag; + if (test_bit(QSEED3_COEF_LUT_Y_SEP_BIT, &lut_flags) && + (scaler3_cfg->y_rgb_sep_lut_idx < QSEED3LITE_SEPARABLE_LUTS) && + (scaler3_cfg->sep_len == QSEED3LITE_SEP_LUT_SIZE)) { + lut[0] = scaler3_cfg->sep_lut + + scaler3_cfg->y_rgb_sep_lut_idx * QSEED3LITE_LUT_SIZE; + config_lut = 1; + } + if (test_bit(QSEED3_COEF_LUT_UV_SEP_BIT, &lut_flags) && + (scaler3_cfg->uv_sep_lut_idx < QSEED3LITE_SEPARABLE_LUTS) && + (scaler3_cfg->sep_len == QSEED3LITE_SEP_LUT_SIZE)) { + lut[1] = scaler3_cfg->sep_lut + + scaler3_cfg->uv_sep_lut_idx * QSEED3LITE_LUT_SIZE; + config_lut = 1; + } + + if (config_lut) { + for (filter = 0; filter < QSEED3LITE_FILTERS; filter++) { + if (!lut[filter]) + continue; + lut_offset = 0; + lut_addr = QSEED3_COEF_LUT + offset + off_tbl[filter]; + for (j = 0; j < QSEED3LITE_LUT_SIZE; j++) { + DPU_REG_WRITE(c, + lut_addr, + (lut[filter])[lut_offset++]); + lut_addr += 4; + } + } + } + + if (test_bit(QSEED3_COEF_LUT_SWAP_BIT, &lut_flags)) + DPU_REG_WRITE(c, QSEED3_COEF_LUT_CTRL + offset, BIT(0)); + +} + static void _dpu_hw_setup_scaler3_de(struct dpu_hw_blk_reg_map *c, struct dpu_hw_scaler3_de_cfg *de_cfg, u32 offset) { @@ -242,9 +306,12 @@ void dpu_hw_setup_scaler3(struct dpu_hw_blk_reg_map *c, op_mode |= BIT(8); } - if (scaler3_cfg->lut_flag) - _dpu_hw_setup_scaler3_lut(c, scaler3_cfg, - scaler_offset); + if (scaler3_cfg->lut_flag) { + if (scaler_version < 0x2004) + _dpu_hw_setup_scaler3_lut(c, scaler3_cfg, scaler_offset); + else + _dpu_hw_setup_scaler3lite_lut(c, scaler3_cfg, scaler_offset); + } if (scaler_version == 0x1002) { phase_init = diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.h index 234eb7d65753..ff3cffde84cd 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_util.h @@ -97,6 +97,7 @@ struct dpu_hw_scaler3_de_cfg { * @ cir_lut: pointer to circular filter LUT * @ sep_lut: pointer to separable filter LUT * @ de: detail enhancer configuration + * @ dir_weight: Directional weight */ struct dpu_hw_scaler3_cfg { u32 enable; @@ -137,6 +138,8 @@ struct dpu_hw_scaler3_cfg { * Detail enhancer settings */ struct dpu_hw_scaler3_de_cfg de; + + u32 dir_weight; }; /** diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index bc0231a50132..f898a8f67b7f 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -1465,6 +1465,7 @@ static int _dpu_plane_init_debugfs(struct drm_plane *plane) pdpu->debugfs_root, &pdpu->debugfs_src); if (cfg->features & BIT(DPU_SSPP_SCALER_QSEED3) || + cfg->features & BIT(DPU_SSPP_SCALER_QSEED3LITE) || cfg->features & BIT(DPU_SSPP_SCALER_QSEED2) || cfg->features & BIT(DPU_SSPP_SCALER_QSEED4)) { dpu_debugfs_setup_regset32(&pdpu->debugfs_scaler, From 33a7808ce1aea6e2edc1af25db25928137940c02 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Mon, 18 Jan 2021 17:15:58 +0100 Subject: [PATCH 070/506] drm/msm/dsi: Correct io_start for MSM8994 (20nm PHY) The previous registers were *almost* correct, but instead of PHYs, they were pointing at DSI PLLs, resulting in the PHY id autodetection failing miserably. Fixes: dcefc117cc19 ("drm/msm/dsi: Add support for msm8x94") Signed-off-by: Konrad Dybcio Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/phy/dsi_phy_20nm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_20nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_20nm.c index 1afb7c579dbb..eca86bf448f7 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_20nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_20nm.c @@ -139,7 +139,7 @@ const struct msm_dsi_phy_cfg dsi_phy_20nm_cfgs = { .disable = dsi_20nm_phy_disable, .init = msm_dsi_phy_init_common, }, - .io_start = { 0xfd998300, 0xfd9a0300 }, + .io_start = { 0xfd998500, 0xfd9a0500 }, .num_dsi_phy = 2, }; From 68e4f01fddb4ead80e8c7084db489307f22c9cbb Mon Sep 17 00:00:00 2001 From: Iskren Chernev Date: Wed, 27 Jan 2021 17:24:40 +0200 Subject: [PATCH 071/506] drm/msm/mdp5: Fix wait-for-commit for cmd panels Before the offending commit in msm_atomic_commit_tail wait_flush was called once per frame, after the commit was submitted. After it wait_flush is also called at the beginning to ensure previous potentially async commits are done. For cmd panels the source of wait_flush is a ping-pong irq notifying a completion. The completion needs to be notified with complete_all so multiple waiting parties (new async committers) can proceed. Signed-off-by: Iskren Chernev Suggested-by: Rob Clark Fixes: 2d99ced787e3d ("drm/msm: async commit support") Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c index 0c8f9f88301f..f5d71b274079 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c @@ -1180,7 +1180,7 @@ static void mdp5_crtc_pp_done_irq(struct mdp_irq *irq, uint32_t irqstatus) struct mdp5_crtc *mdp5_crtc = container_of(irq, struct mdp5_crtc, pp_done); - complete(&mdp5_crtc->pp_completion); + complete_all(&mdp5_crtc->pp_completion); } static void mdp5_crtc_wait_for_pp_done(struct drm_crtc *crtc) From 7a7cbf2a819740674455ad36155c662367261296 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 28 Jan 2021 13:03:30 -0800 Subject: [PATCH 072/506] drm/msm: Fix race of GPU init vs timestamp power management. We were using the same force-poweron bit in the two codepaths, so they could race to have one of them lose GPU power early. freedreno CI was seeing intermittent errors like: [drm:_a6xx_gmu_set_oob] *ERROR* Timeout waiting for GMU OOB set GPU_SET: 0x0 and this issue could have contributed to it. Signed-off-by: Eric Anholt Fixes: 4b565ca5a2cb ("drm/msm: Add A6XX device support") Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 25 ++++++++++++++++++++++--- drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 8 ++++++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index e6703ae98760..b3318f86aabc 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -264,6 +264,16 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) } name = "GPU_SET"; break; + case GMU_OOB_PERFCOUNTER_SET: + if (gmu->legacy) { + request = GMU_OOB_PERFCOUNTER_REQUEST; + ack = GMU_OOB_PERFCOUNTER_ACK; + } else { + request = GMU_OOB_PERFCOUNTER_REQUEST_NEW; + ack = GMU_OOB_PERFCOUNTER_ACK_NEW; + } + name = "PERFCOUNTER"; + break; case GMU_OOB_BOOT_SLUMBER: request = GMU_OOB_BOOT_SLUMBER_REQUEST; ack = GMU_OOB_BOOT_SLUMBER_ACK; @@ -301,9 +311,14 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) { if (!gmu->legacy) { - WARN_ON(state != GMU_OOB_GPU_SET); - gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, - 1 << GMU_OOB_GPU_SET_CLEAR_NEW); + if (state == GMU_OOB_GPU_SET) { + gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, + 1 << GMU_OOB_GPU_SET_CLEAR_NEW); + } else { + WARN_ON(state != GMU_OOB_PERFCOUNTER_SET); + gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, + 1 << GMU_OOB_PERFCOUNTER_CLEAR_NEW); + } return; } @@ -312,6 +327,10 @@ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << GMU_OOB_GPU_SET_CLEAR); break; + case GMU_OOB_PERFCOUNTER_SET: + gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, + 1 << GMU_OOB_PERFCOUNTER_CLEAR); + break; case GMU_OOB_BOOT_SLUMBER: gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << GMU_OOB_BOOT_SLUMBER_CLEAR); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index c6d2bced8e5d..9fa278de2106 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -156,6 +156,7 @@ enum a6xx_gmu_oob_state { GMU_OOB_BOOT_SLUMBER = 0, GMU_OOB_GPU_SET, GMU_OOB_DCVS_SET, + GMU_OOB_PERFCOUNTER_SET, }; /* These are the interrupt / ack bits for each OOB request that are set @@ -190,6 +191,13 @@ enum a6xx_gmu_oob_state { #define GMU_OOB_GPU_SET_ACK_NEW 31 #define GMU_OOB_GPU_SET_CLEAR_NEW 31 +#define GMU_OOB_PERFCOUNTER_REQUEST 17 +#define GMU_OOB_PERFCOUNTER_ACK 25 +#define GMU_OOB_PERFCOUNTER_CLEAR 25 + +#define GMU_OOB_PERFCOUNTER_REQUEST_NEW 28 +#define GMU_OOB_PERFCOUNTER_ACK_NEW 30 +#define GMU_OOB_PERFCOUNTER_CLEAR_NEW 30 void a6xx_hfi_init(struct a6xx_gmu *gmu); int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 0a51cad42642..9be15cf169c9 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1172,12 +1172,12 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value) struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); /* Force the GPU power on so we can read this register */ - a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET); + a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); *value = gpu_read64(gpu, REG_A6XX_RBBM_PERFCTR_CP_0_LO, REG_A6XX_RBBM_PERFCTR_CP_0_HI); - a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET); + a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); return 0; } From 5f98b33b04c02c0d9088c7486c59d058696782f9 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 28 Jan 2021 13:03:31 -0800 Subject: [PATCH 073/506] drm/msm: Fix races managing the OOB state for timestamp vs timestamps. Now that we're not racing with GPU setup, also fix races of timestamps against other timestamps. In freedreno CI, we were seeing this path trigger timeouts on setting the GMU bit, producing: [drm:_a6xx_gmu_set_oob] *ERROR* Timeout waiting for GMU OOB set GPU_SET: 0x0 and this triggered especially on the first set of tests right after boot (it's probably easier to lose the race than one might think, given that we start many tests in parallel, and waiting for NFS to page in code probably means that lots of tests hit the same point of screen init at the same time). As of this patch, the message seems to have completely gone away. Signed-off-by: Eric Anholt Fixes: 4b565ca5a2cb ("drm/msm: Add A6XX device support") Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 9be15cf169c9..ba8e9d3cf0fe 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1170,6 +1170,9 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + static DEFINE_MUTEX(perfcounter_oob); + + mutex_lock(&perfcounter_oob); /* Force the GPU power on so we can read this register */ a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); @@ -1178,6 +1181,7 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value) REG_A6XX_RBBM_PERFCTR_CP_0_HI); a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); + mutex_unlock(&perfcounter_oob); return 0; } From 555c50a4a19b28bee81ad257e0cdc0fcf6e8dab5 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 28 Jan 2021 13:03:32 -0800 Subject: [PATCH 074/506] drm/msm: Clean up GMU OOB set/clear handling. Now that the bug is fixed in the minimal way for stable, go make the code table-driven. Signed-off-by: Eric Anholt Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 126 +++++++++++++------------- drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 55 +++-------- 2 files changed, 78 insertions(+), 103 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index b3318f86aabc..9066e98eb8ef 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -245,47 +245,66 @@ static int a6xx_gmu_hfi_start(struct a6xx_gmu *gmu) return ret; } +struct a6xx_gmu_oob_bits { + int set, ack, set_new, ack_new; + const char *name; +}; + +/* These are the interrupt / ack bits for each OOB request that are set + * in a6xx_gmu_set_oob and a6xx_clear_oob + */ +static const struct a6xx_gmu_oob_bits a6xx_gmu_oob_bits[] = { + [GMU_OOB_GPU_SET] = { + .name = "GPU_SET", + .set = 16, + .ack = 24, + .set_new = 30, + .ack_new = 31, + }, + + [GMU_OOB_PERFCOUNTER_SET] = { + .name = "PERFCOUNTER", + .set = 17, + .ack = 25, + .set_new = 28, + .ack_new = 30, + }, + + [GMU_OOB_BOOT_SLUMBER] = { + .name = "BOOT_SLUMBER", + .set = 22, + .ack = 30, + }, + + [GMU_OOB_DCVS_SET] = { + .name = "GPU_DCVS", + .set = 23, + .ack = 31, + }, +}; + /* Trigger a OOB (out of band) request to the GMU */ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) { int ret; u32 val; int request, ack; - const char *name; - switch (state) { - case GMU_OOB_GPU_SET: - if (gmu->legacy) { - request = GMU_OOB_GPU_SET_REQUEST; - ack = GMU_OOB_GPU_SET_ACK; - } else { - request = GMU_OOB_GPU_SET_REQUEST_NEW; - ack = GMU_OOB_GPU_SET_ACK_NEW; - } - name = "GPU_SET"; - break; - case GMU_OOB_PERFCOUNTER_SET: - if (gmu->legacy) { - request = GMU_OOB_PERFCOUNTER_REQUEST; - ack = GMU_OOB_PERFCOUNTER_ACK; - } else { - request = GMU_OOB_PERFCOUNTER_REQUEST_NEW; - ack = GMU_OOB_PERFCOUNTER_ACK_NEW; - } - name = "PERFCOUNTER"; - break; - case GMU_OOB_BOOT_SLUMBER: - request = GMU_OOB_BOOT_SLUMBER_REQUEST; - ack = GMU_OOB_BOOT_SLUMBER_ACK; - name = "BOOT_SLUMBER"; - break; - case GMU_OOB_DCVS_SET: - request = GMU_OOB_DCVS_REQUEST; - ack = GMU_OOB_DCVS_ACK; - name = "GPU_DCVS"; - break; - default: + if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits)) return -EINVAL; + + if (gmu->legacy) { + request = a6xx_gmu_oob_bits[state].set; + ack = a6xx_gmu_oob_bits[state].ack; + } else { + request = a6xx_gmu_oob_bits[state].set_new; + ack = a6xx_gmu_oob_bits[state].ack_new; + if (!request || !ack) { + DRM_DEV_ERROR(gmu->dev, + "Invalid non-legacy GMU request %s\n", + a6xx_gmu_oob_bits[state].name); + return -EINVAL; + } } /* Trigger the equested OOB operation */ @@ -298,7 +317,7 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) if (ret) DRM_DEV_ERROR(gmu->dev, "Timeout waiting for GMU OOB set %s: 0x%x\n", - name, + a6xx_gmu_oob_bits[state].name, gmu_read(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO)); /* Clear the acknowledge interrupt */ @@ -310,36 +329,17 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) /* Clear a pending OOB state in the GMU */ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) { - if (!gmu->legacy) { - if (state == GMU_OOB_GPU_SET) { - gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, - 1 << GMU_OOB_GPU_SET_CLEAR_NEW); - } else { - WARN_ON(state != GMU_OOB_PERFCOUNTER_SET); - gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, - 1 << GMU_OOB_PERFCOUNTER_CLEAR_NEW); - } - return; - } + int bit; - switch (state) { - case GMU_OOB_GPU_SET: - gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, - 1 << GMU_OOB_GPU_SET_CLEAR); - break; - case GMU_OOB_PERFCOUNTER_SET: - gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, - 1 << GMU_OOB_PERFCOUNTER_CLEAR); - break; - case GMU_OOB_BOOT_SLUMBER: - gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, - 1 << GMU_OOB_BOOT_SLUMBER_CLEAR); - break; - case GMU_OOB_DCVS_SET: - gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, - 1 << GMU_OOB_DCVS_CLEAR); - break; - } + if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits)) + return; + + if (gmu->legacy) + bit = a6xx_gmu_oob_bits[state].ack; + else + bit = a6xx_gmu_oob_bits[state].ack_new; + + gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, bit); } /* Enable CPU control of SPTP power power collapse */ diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index 9fa278de2106..71dfa60070cc 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -153,52 +153,27 @@ static inline void gmu_write_rscc(struct a6xx_gmu *gmu, u32 offset, u32 value) */ enum a6xx_gmu_oob_state { + /* + * Let the GMU know that a boot or slumber operation has started. The value in + * REG_A6XX_GMU_BOOT_SLUMBER_OPTION lets the GMU know which operation we are + * doing + */ GMU_OOB_BOOT_SLUMBER = 0, + /* + * Let the GMU know to not turn off any GPU registers while the CPU is in a + * critical section + */ GMU_OOB_GPU_SET, + /* + * Set a new power level for the GPU when the CPU is doing frequency scaling + */ GMU_OOB_DCVS_SET, + /* + * Used to keep the GPU on for CPU-side reads of performance counters. + */ GMU_OOB_PERFCOUNTER_SET, }; -/* These are the interrupt / ack bits for each OOB request that are set - * in a6xx_gmu_set_oob and a6xx_clear_oob - */ - -/* - * Let the GMU know that a boot or slumber operation has started. The value in - * REG_A6XX_GMU_BOOT_SLUMBER_OPTION lets the GMU know which operation we are - * doing - */ -#define GMU_OOB_BOOT_SLUMBER_REQUEST 22 -#define GMU_OOB_BOOT_SLUMBER_ACK 30 -#define GMU_OOB_BOOT_SLUMBER_CLEAR 30 - -/* - * Set a new power level for the GPU when the CPU is doing frequency scaling - */ -#define GMU_OOB_DCVS_REQUEST 23 -#define GMU_OOB_DCVS_ACK 31 -#define GMU_OOB_DCVS_CLEAR 31 - -/* - * Let the GMU know to not turn off any GPU registers while the CPU is in a - * critical section - */ -#define GMU_OOB_GPU_SET_REQUEST 16 -#define GMU_OOB_GPU_SET_ACK 24 -#define GMU_OOB_GPU_SET_CLEAR 24 - -#define GMU_OOB_GPU_SET_REQUEST_NEW 30 -#define GMU_OOB_GPU_SET_ACK_NEW 31 -#define GMU_OOB_GPU_SET_CLEAR_NEW 31 - -#define GMU_OOB_PERFCOUNTER_REQUEST 17 -#define GMU_OOB_PERFCOUNTER_ACK 25 -#define GMU_OOB_PERFCOUNTER_CLEAR 25 - -#define GMU_OOB_PERFCOUNTER_REQUEST_NEW 28 -#define GMU_OOB_PERFCOUNTER_ACK_NEW 30 -#define GMU_OOB_PERFCOUNTER_CLEAR_NEW 30 - void a6xx_hfi_init(struct a6xx_gmu *gmu); int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state); void a6xx_hfi_stop(struct a6xx_gmu *gmu); From 6ec9351809612fa1c0256fb3e39b49b6100e2983 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 28 Jan 2021 09:09:29 -0800 Subject: [PATCH 075/506] drm/msm/kms: Make a lock_class_key for each crtc mutex Lockdep complains about an AA deadlock when rebooting the device. base-commit: 19c329f6808995b142b3966301f217c831e7cf31 ============================================ WARNING: possible recursive locking detected 5.4.91 #1 Not tainted -------------------------------------------- reboot/5213 is trying to acquire lock: ffffff80d13391b0 (&kms->commit_lock[i]){+.+.}, at: lock_crtcs+0x60/0xa4 but task is already holding lock: ffffff80d1339110 (&kms->commit_lock[i]){+.+.}, at: lock_crtcs+0x60/0xa4 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&kms->commit_lock[i]); lock(&kms->commit_lock[i]); *** DEADLOCK *** May be due to missing lock nesting notation 6 locks held by reboot/5213: __arm64_sys_reboot+0x148/0x2a0 device_shutdown+0x10c/0x2c4 drm_atomic_helper_shutdown+0x48/0xfc modeset_lock+0x120/0x24c lock_crtcs+0x60/0xa4 stack backtrace: CPU: 4 PID: 5213 Comm: reboot Not tainted 5.4.91 #1 Hardware name: Google Pompom (rev1) with LTE (DT) Call trace: dump_backtrace+0x0/0x1dc show_stack+0x24/0x30 dump_stack+0xfc/0x1a8 __lock_acquire+0xcd0/0x22b8 lock_acquire+0x1ec/0x240 __mutex_lock_common+0xe0/0xc84 mutex_lock_nested+0x48/0x58 lock_crtcs+0x60/0xa4 msm_atomic_commit_tail+0x348/0x570 commit_tail+0xdc/0x178 drm_atomic_helper_commit+0x160/0x168 drm_atomic_commit+0x68/0x80 This is because lockdep thinks all the locks taken in lock_crtcs() are the same lock, when they actually aren't. That's because we call mutex_init() in msm_kms_init() and that assigns one static key for every lock initialized in this loop. Let's allocate a dynamic number of lock_class_keys and assign them to each lock so that lockdep can figure out an AA deadlock isn't possible here. Fixes: b3d91800d9ac ("drm/msm: Fix race condition in msm driver with async layer updates") Cc: Krishna Manikandan Signed-off-by: Stephen Boyd Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_kms.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_kms.h b/drivers/gpu/drm/msm/msm_kms.h index d8151a89e163..4735251a394d 100644 --- a/drivers/gpu/drm/msm/msm_kms.h +++ b/drivers/gpu/drm/msm/msm_kms.h @@ -157,6 +157,7 @@ struct msm_kms { * from the crtc's pending_timer close to end of the frame: */ struct mutex commit_lock[MAX_CRTCS]; + struct lock_class_key commit_lock_keys[MAX_CRTCS]; unsigned pending_crtc_mask; struct msm_pending_timer pending_timers[MAX_CRTCS]; }; @@ -166,8 +167,11 @@ static inline int msm_kms_init(struct msm_kms *kms, { unsigned i, ret; - for (i = 0; i < ARRAY_SIZE(kms->commit_lock); i++) - mutex_init(&kms->commit_lock[i]); + for (i = 0; i < ARRAY_SIZE(kms->commit_lock); i++) { + lockdep_register_key(&kms->commit_lock_keys[i]); + __mutex_init(&kms->commit_lock[i], "&kms->commit_lock[i]", + &kms->commit_lock_keys[i]); + } kms->funcs = funcs; From dd5d08b5e54d54563d2de0ef6aa959e47485d7da Mon Sep 17 00:00:00 2001 From: Jiapeng Zhong Date: Tue, 26 Jan 2021 17:51:19 +0800 Subject: [PATCH 076/506] drm/msm: remove redundant NULL check Fix below warnings reported by coccicheck: ./drivers/gpu/drm/msm/msm_gem.c:991:3-9: WARNING: NULL check before some freeing functions is not needed. Reported-by: Abaci Robot Signed-off-by: Jiapeng Zhong Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 9d10739c4eb2..4f1f1a09ca89 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -987,8 +987,7 @@ void msm_gem_free_object(struct drm_gem_object *obj) /* Don't drop the pages for imported dmabuf, as they are not * ours, just free the array we allocated: */ - if (msm_obj->pages) - kvfree(msm_obj->pages); + kvfree(msm_obj->pages); put_iova_vmas(obj); From 0ac8924511434f1b9d83e1b666e43d628bd84f48 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 29 Jan 2021 09:30:29 +0000 Subject: [PATCH 077/506] drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c: Remove unneeded semicolon fix semicolon.cocci warnings: drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c:752:2-3: Unneeded semicolon Signed-off-by: Xu Wang Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c index 374b0e8471e6..5a8e3e1fc48c 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c @@ -749,7 +749,7 @@ static void _dpu_kms_set_encoder_mode(struct msm_kms *kms, case DRM_MODE_ENCODER_TMDS: info.num_of_h_tiles = 1; break; - }; + } rc = dpu_encoder_setup(encoder->dev, encoder, &info); if (rc) From a014abfec5413d4625a089bbed45a424b5d4d24a Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 29 Jan 2021 09:44:16 +0000 Subject: [PATCH 078/506] drm/msm/dp/dp_ctrl: Remove unneeded semicolon fix semicolon.cocci warnings: drivers/gpu/drm/msm/dp/dp_ctrl.c:1161:2-3: Unneeded semicolon Signed-off-by: Xu Wang Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c index 5ac155d6b85f..55b7d0edffbf 100644 --- a/drivers/gpu/drm/msm/dp/dp_ctrl.c +++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c @@ -1158,7 +1158,7 @@ static int dp_ctrl_link_rate_down_shift(struct dp_ctrl_private *ctrl) default: ret = -EINVAL; break; - }; + } if (!ret) DRM_DEBUG_DP("new rate=0x%x\n", ctrl->link->link_params.rate); From c703d5789590935c573bbd080a2166b72d51a017 Mon Sep 17 00:00:00 2001 From: Judy Hsiao Date: Fri, 29 Jan 2021 18:16:03 +0800 Subject: [PATCH 079/506] drm/msm/dp: trigger unplug event in msm_dp_display_disable 1. Trigger the unplug event in msm_dp_display_disable() to shutdown audio properly. 2. Reset the completion before signal the disconnect event. Fixes: 158b9aa74479 ("drm/msm/dp: wait for audio notification before disabling clocks") Reviewed-by: Stephen Boyd Tested-by: Stephen Boyd Signed-off-by: Judy Hsiao Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_display.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 3a7e6c5adbf5..d9216f80cdad 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -651,8 +651,8 @@ static int dp_hpd_unplug_handle(struct dp_display_private *dp, u32 data) dp_add_event(dp, EV_DISCONNECT_PENDING_TIMEOUT, 0, DP_TIMEOUT_5_SECOND); /* signal the disconnect event early to ensure proper teardown */ - dp_display_handle_plugged_change(g_dp_display, false); reinit_completion(&dp->audio_comp); + dp_display_handle_plugged_change(g_dp_display, false); dp_catalog_hpd_config_intr(dp->catalog, DP_DP_HPD_PLUG_INT_MASK | DP_DP_IRQ_HPD_INT_MASK, true); @@ -897,6 +897,9 @@ static int dp_display_disable(struct dp_display_private *dp, u32 data) /* wait only if audio was enabled */ if (dp_display->audio_enabled) { + /* signal the disconnect event */ + reinit_completion(&dp->audio_comp); + dp_display_handle_plugged_change(dp_display, false); if (!wait_for_completion_timeout(&dp->audio_comp, HZ * 5)) DRM_ERROR("audio comp timeout\n"); From 8c71139d9f84c1963b0a416941244502a20a7e52 Mon Sep 17 00:00:00 2001 From: Calum Mackay Date: Sat, 24 Oct 2020 22:36:38 +0100 Subject: [PATCH 080/506] SUNRPC: correct error code comment in xs_tcp_setup_socket() This comment was introduced by commit 6ea44adce915 ("SUNRPC: ensure correct error is reported by xs_tcp_setup_socket()"). I believe EIO was a typo at the time: it should have been EAGAIN. Subsequently, commit 0445f92c5d53 ("SUNRPC: Fix disconnection races") changed that to ENOTCONN. Rather than trying to keep the comment here in sync with the code in xprt_force_disconnect(), make the point in a non-specific way. Fixes: 6ea44adce915 ("SUNRPC: ensure correct error is reported by xs_tcp_setup_socket()") Signed-off-by: Calum Mackay Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c56a66cdf4ac..2e84da64200e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2276,10 +2276,8 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: - /* - * xs_tcp_force_close() wakes tasks with -EIO. - * We need to wake them first to ensure the - * correct error code. + /* xs_tcp_force_close() wakes tasks with a fixed error code. + * We need to wake them first to ensure the correct error code. */ xprt_wake_pending_tasks(xprt, status); xs_tcp_force_close(xprt); From 12b20ce34933c007cc850151cadf9c8c585c8b8d Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Mon, 9 Nov 2020 15:24:05 +0530 Subject: [PATCH 081/506] net: sunrpc: xprtsock.c: Corrected few spellings ,in comments Few trivial and rudimentary spell corrections. Signed-off-by: Bhaskar Chowdhury Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2e84da64200e..e1ed21d45cdb 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -829,7 +829,7 @@ xs_stream_record_marker(struct xdr_buf *xdr) * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again - * other: Some other error occured, the request was not sent + * other: Some other error occurred, the request was not sent */ static int xs_local_send_request(struct rpc_rqst *req) { @@ -1665,7 +1665,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) * This ensures that we can continue to establish TCP * connections even when all local ephemeral ports are already * a part of some TCP connection. This makes no difference - * for UDP sockets, but also doens't harm them. + * for UDP sockets, but also doesn't harm them. * * If we're asking for any reserved port (i.e. port == 0 && * transport->xprt.resvport == 1) xs_get_srcport above will @@ -2378,7 +2378,7 @@ static void xs_error_handle(struct work_struct *work) } /** - * xs_local_print_stats - display AF_LOCAL socket-specifc stats + * xs_local_print_stats - display AF_LOCAL socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * @@ -2407,7 +2407,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) } /** - * xs_udp_print_stats - display UDP socket-specifc stats + * xs_udp_print_stats - display UDP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * @@ -2431,7 +2431,7 @@ static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) } /** - * xs_tcp_print_stats - display TCP socket-specifc stats + * xs_tcp_print_stats - display TCP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * From c0da04ae079fb4979b921f8b46a6eeb15a4f803c Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 9 Nov 2020 03:12:03 -0500 Subject: [PATCH 082/506] fs/nfs: remove duplicate include 'nfs42.h' is already included above and can be removed here. Signed-off-by: Menglong Dong Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2f4679a62712..224773eb61b1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -71,10 +71,6 @@ #include "nfs4trace.h" -#ifdef CONFIG_NFS_V4_2 -#include "nfs42.h" -#endif /* CONFIG_NFS_V4_2 */ - #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_BITMASK_SZ 3 From 5c191fef4ce2fc4310d8174bd3861474919201e4 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Sat, 9 Jan 2021 14:51:08 +0100 Subject: [PATCH 083/506] drm/msm/dsi_pll_10nm: Fix dividing the same numbers twice In function dsi_pll_calc_dec_frac we are calculating the decimal div start parameter by dividing the decimal multiple by the fractional multiplier: the remainder of that operation is stored to then get programmed to the fractional divider registers of the PLL. It's useless to call div_u64_rem to get the remainder and *then* call div_u64 to get the division result, as the first is already giving that result: let's fix it by just caring about the result of div_u64_rem. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c index e4e9bf04b736..2820b3118720 100644 --- a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c +++ b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c @@ -172,9 +172,7 @@ static void dsi_pll_calc_dec_frac(struct dsi_pll_10nm *pll) multiplier = 1 << config->frac_bits; dec_multiple = div_u64(pll_freq * multiplier, divider); - div_u64_rem(dec_multiple, multiplier, &frac); - - dec = div_u64(dec_multiple, multiplier); + dec = div_u64_rem(dec_multiple, multiplier, &frac); if (pll_freq <= 1900000000UL) regs->pll_prop_gain_rate = 8; From 196145eb1af18efb3fc6dd1e40cfc4dc47e47603 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Sat, 9 Jan 2021 14:51:09 +0100 Subject: [PATCH 084/506] drm/msm/dsi_pll_10nm: Solve TODO for multiplier frac_bits assignment The number of fractional registers bits is known and already set in the frac_bits variable of the dsi_pll_config struct here in 10nm: remove the TODO by simply using that variable. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c index 2820b3118720..72306a86c41f 100644 --- a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c +++ b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c @@ -481,6 +481,7 @@ static unsigned long dsi_pll_10nm_vco_recalc_rate(struct clk_hw *hw, { struct msm_dsi_pll *pll = hw_clk_to_pll(hw); struct dsi_pll_10nm *pll_10nm = to_pll_10nm(pll); + struct dsi_pll_config *config = &pll_10nm->pll_configuration; void __iomem *base = pll_10nm->mmio; u64 ref_clk = pll_10nm->vco_ref_clk_rate; u64 vco_rate = 0x0; @@ -501,9 +502,8 @@ static unsigned long dsi_pll_10nm_vco_recalc_rate(struct clk_hw *hw, /* * TODO: * 1. Assumes prescaler is disabled - * 2. Multiplier is 2^18. it should be 2^(num_of_frac_bits) */ - multiplier = 1 << 18; + multiplier = 1 << config->frac_bits; pll_freq = dec * (ref_clk * 2); tmp64 = (ref_clk * 2 * frac); pll_freq += div_u64(tmp64, multiplier); From 362cadf34b9fe030f271ded38d5062eac12f3340 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Sat, 9 Jan 2021 14:51:11 +0100 Subject: [PATCH 085/506] drm/msm/dsi_pll_10nm: Fix variable usage for pll_lockdet_rate The PLL_LOCKDET_RATE_1 was being programmed with a hardcoded value directly, but the same value was also being specified in the dsi_pll_regs struct pll_lockdet_rate variable: let's use it! Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c index 72306a86c41f..2e1cb4154c0f 100644 --- a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c +++ b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c @@ -304,7 +304,8 @@ static void dsi_pll_commit(struct dsi_pll_10nm *pll) reg->frac_div_start_mid); pll_write(base + REG_DSI_10nm_PHY_PLL_FRAC_DIV_START_HIGH_1, reg->frac_div_start_high); - pll_write(base + REG_DSI_10nm_PHY_PLL_PLL_LOCKDET_RATE_1, 0x40); + pll_write(base + REG_DSI_10nm_PHY_PLL_PLL_LOCKDET_RATE_1, + reg->pll_lockdet_rate); pll_write(base + REG_DSI_10nm_PHY_PLL_PLL_LOCK_DELAY, 0x06); pll_write(base + REG_DSI_10nm_PHY_PLL_CMODE, 0x10); pll_write(base + REG_DSI_10nm_PHY_PLL_CLOCK_INVERTERS, From ce5226625a694fa5a46db241ac4c77a1430e37fe Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Sat, 9 Jan 2021 14:51:12 +0100 Subject: [PATCH 086/506] drm/msm/dsi_pll_10nm: Convert pr_err prints to DRM_DEV_ERROR DRM_DEV_ERROR should be used across this entire source: convert the pr_err prints to the first as a cleanup. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c index 2e1cb4154c0f..de3b802ccd3d 100644 --- a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c +++ b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c @@ -344,6 +344,7 @@ static int dsi_pll_10nm_vco_set_rate(struct clk_hw *hw, unsigned long rate, static int dsi_pll_10nm_lock_status(struct dsi_pll_10nm *pll) { + struct device *dev = &pll->pdev->dev; int rc; u32 status = 0; u32 const delay_us = 100; @@ -356,8 +357,8 @@ static int dsi_pll_10nm_lock_status(struct dsi_pll_10nm *pll) delay_us, timeout_us); if (rc) - pr_err("DSI PLL(%d) lock failed, status=0x%08x\n", - pll->id, status); + DRM_DEV_ERROR(dev, "DSI PLL(%d) lock failed, status=0x%08x\n", + pll->id, status); return rc; } @@ -404,6 +405,7 @@ static int dsi_pll_10nm_vco_prepare(struct clk_hw *hw) { struct msm_dsi_pll *pll = hw_clk_to_pll(hw); struct dsi_pll_10nm *pll_10nm = to_pll_10nm(pll); + struct device *dev = &pll_10nm->pdev->dev; int rc; dsi_pll_enable_pll_bias(pll_10nm); @@ -412,7 +414,7 @@ static int dsi_pll_10nm_vco_prepare(struct clk_hw *hw) rc = dsi_pll_10nm_vco_set_rate(hw,pll_10nm->vco_current_rate, 0); if (rc) { - pr_err("vco_set_rate failed, rc=%d\n", rc); + DRM_DEV_ERROR(dev, "vco_set_rate failed, rc=%d\n", rc); return rc; } @@ -429,7 +431,7 @@ static int dsi_pll_10nm_vco_prepare(struct clk_hw *hw) /* Check for PLL lock */ rc = dsi_pll_10nm_lock_status(pll_10nm); if (rc) { - pr_err("PLL(%d) lock failed\n", pll_10nm->id); + DRM_DEV_ERROR(dev, "PLL(%d) lock failed\n", pll_10nm->id); goto error; } From 93f479d3ad05497f29f2bed58e4a6c6a4f0a548c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Nov 2020 12:40:02 -0600 Subject: [PATCH 087/506] SUNRPC: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix multiple warnings by explicitly adding multiple break statements instead of letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Anna Schumaker --- net/sunrpc/rpc_pipe.c | 1 + net/sunrpc/xprtsock.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 8241f5a4a01c..09c000d490a1 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -478,6 +478,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); + break; default: break; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e1ed21d45cdb..e35760f238a4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1875,6 +1875,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xprt_set_connected(xprt); + break; case -ENOBUFS: break; case -ENOENT: From ffb81717a166b3c4a676ada61283b3121448e503 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Nov 2020 12:26:46 -0600 Subject: [PATCH 088/506] nfs: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix multiple warnings by explicitly add multiple break/goto/return/fallthrough statements instead of just letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Anna Schumaker --- fs/nfs/nfs3acl.c | 1 + fs/nfs/nfs4client.c | 1 + fs/nfs/nfs4proc.c | 2 ++ fs/nfs/nfs4state.c | 1 + fs/nfs/pnfs.c | 2 ++ 5 files changed, 7 insertions(+) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index c6c863382f37..68e206cf4c4e 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -111,6 +111,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; + goto getout; default: goto getout; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86acffe7335c..889a9f4c0310 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -609,6 +609,7 @@ int nfs40_walk_client_list(struct nfs_client *new, * changed. Schedule recovery! */ nfs4_schedule_path_down_recovery(pos); + goto out; default: goto out; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 224773eb61b1..34570a7785fa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2227,6 +2227,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct default: printk(KERN_ERR "NFS: %s: unhandled error " "%d.\n", __func__, err); + fallthrough; case 0: case -ENOENT: case -EAGAIN: @@ -9701,6 +9702,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; + break; case 0: break; default: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4bf10792cb5b..3a51351bdc6a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1125,6 +1125,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) " sequence-id error on an" " unconfirmed sequence %p!\n", seqid->sequence); + return; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index af64b4e6fd1f..102b66e0bdef 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2875,6 +2875,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -3019,6 +3020,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: From 49dee70052b89498cc0fc61e0e193cefeee40989 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 28 Jan 2021 09:54:59 -0500 Subject: [PATCH 089/506] NFS: Clean up nfs_readpage() and nfs_readpages() In prep for the new fscache netfs API, refactor nfs_readpage() and nfs_readpages() for future patches. No functional change. Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/read.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb854f1f86e2..dd92156e27c5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -314,7 +314,7 @@ int nfs_readpage(struct file *file, struct page *page) { struct nfs_open_context *ctx; struct inode *inode = page_file_mapping(page)->host; - int error; + int ret; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_SIZE, page_index(page)); @@ -328,18 +328,18 @@ int nfs_readpage(struct file *file, struct page *page) * be any new pending writes generated at this point * for this page (other pages can be written to). */ - error = nfs_wb_page(inode, page); - if (error) + ret = nfs_wb_page(inode, page); + if (ret) goto out_unlock; if (PageUptodate(page)) goto out_unlock; - error = -ESTALE; + ret = -ESTALE; if (NFS_STALE(inode)) goto out_unlock; if (file == NULL) { - error = -EBADF; + ret = -EBADF; ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (ctx == NULL) goto out_unlock; @@ -347,24 +347,24 @@ int nfs_readpage(struct file *file, struct page *page) ctx = get_nfs_open_context(nfs_file_open_context(file)); if (!IS_SYNC(inode)) { - error = nfs_readpage_from_fscache(ctx, inode, page); - if (error == 0) + ret = nfs_readpage_from_fscache(ctx, inode, page); + if (ret == 0) goto out; } xchg(&ctx->error, 0); - error = nfs_readpage_async(ctx, inode, page); - if (!error) { - error = wait_on_page_locked_killable(page); - if (!PageUptodate(page) && !error) - error = xchg(&ctx->error, 0); + ret = nfs_readpage_async(ctx, inode, page); + if (!ret) { + ret = wait_on_page_locked_killable(page); + if (!PageUptodate(page) && !ret) + ret = xchg(&ctx->error, 0); } out: put_nfs_open_context(ctx); - return error; + return ret; out_unlock: unlock_page(page); - return error; + return ret; } struct nfs_readdesc { @@ -404,17 +404,15 @@ readpage_async_filler(void *data, struct page *page) return error; } -int nfs_readpages(struct file *filp, struct address_space *mapping, +int nfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct nfs_pageio_descriptor pgio; struct nfs_pgio_mirror *pgm; - struct nfs_readdesc desc = { - .pgio = &pgio, - }; + struct nfs_readdesc desc; struct inode *inode = mapping->host; unsigned long npages; - int ret = -ESTALE; + int ret; dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, @@ -422,15 +420,17 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + ret = -ESTALE; if (NFS_STALE(inode)) goto out; - if (filp == NULL) { + if (file == NULL) { + ret = -EBADF; desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (desc.ctx == NULL) - return -EBADF; + goto out; } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); /* attempt to read as many of the pages as possible from the cache * - this returns -ENOBUFS immediately if the cookie is negative @@ -440,6 +440,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ + desc.pgio = &pgio; nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); From 6ddfd213f4ea22ac955bcd82100c57cd294494d2 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 28 Jan 2021 09:55:00 -0500 Subject: [PATCH 090/506] NFS: In nfs_readpage() only increment NFSIOS_READPAGES when read succeeds There is a small inconsistency with nfs_readpage() vs nfs_readpages() with regards to NFSIOS_READPAGES. In readpage we unconditionally increment NFSIOS_READPAGES at the top, which means even if the read fails. In readpages, we increment NFSIOS_READPAGES at the bottom based on how many pages were successfully read. Change readpage to be consistent with readpages and so NFSIOS_READPAGES only reflects successful, non-fscache reads. Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/read.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index dd92156e27c5..464077daf62f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -319,7 +319,6 @@ int nfs_readpage(struct file *file, struct page *page) dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_SIZE, page_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to flush any pending writes to the file.. @@ -359,6 +358,7 @@ int nfs_readpage(struct file *file, struct page *page) if (!PageUptodate(page) && !ret) ret = xchg(&ctx->error, 0); } + nfs_add_stats(inode, NFSIOS_READPAGES, 1); out: put_nfs_open_context(ctx); return ret; From 1af7e7f8c12f521c111bd7cf0d138be7e15b51a5 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 28 Jan 2021 09:55:01 -0500 Subject: [PATCH 091/506] NFS: Refactor nfs_readpage() and nfs_readpage_async() to use nfs_readdesc Both nfs_readpage() and nfs_readpages() use similar code. This patch should be no functional change, and refactors nfs_readpage_async() to use nfs_readdesc to enable future merging of nfs_readpage_async() and nfs_readpage_async_filler(). Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/read.c | 62 ++++++++++++++++++++---------------------- include/linux/nfs_fs.h | 3 +- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 464077daf62f..8c05e56dab65 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -114,18 +114,23 @@ static void nfs_readpage_release(struct nfs_page *req, int error) nfs_release_request(req); } -int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, +struct nfs_readdesc { + struct nfs_pageio_descriptor pgio; + struct nfs_open_context *ctx; +}; + +int nfs_readpage_async(void *data, struct inode *inode, struct page *page) { + struct nfs_readdesc *desc = data; struct nfs_page *new; unsigned int len; - struct nfs_pageio_descriptor pgio; struct nfs_pgio_mirror *pgm; len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, page, 0, len); + new = nfs_create_request(desc->ctx, page, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -133,21 +138,21 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_SIZE) zero_user_segment(page, len, PAGE_SIZE); - nfs_pageio_init_read(&pgio, inode, false, + nfs_pageio_init_read(&desc->pgio, inode, false, &nfs_async_read_completion_ops); - if (!nfs_pageio_add_request(&pgio, new)) { + if (!nfs_pageio_add_request(&desc->pgio, new)) { nfs_list_remove_request(new); - nfs_readpage_release(new, pgio.pg_error); + nfs_readpage_release(new, desc->pgio.pg_error); } - nfs_pageio_complete(&pgio); + nfs_pageio_complete(&desc->pgio); /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); + WARN_ON_ONCE(desc->pgio.pg_mirror_count != 1); - pgm = &pgio.pg_mirrors[0]; + pgm = &desc->pgio.pg_mirrors[0]; NFS_I(inode)->read_io += pgm->pg_bytes_written; - return pgio.pg_error < 0 ? pgio.pg_error : 0; + return desc->pgio.pg_error < 0 ? desc->pgio.pg_error : 0; } static void nfs_page_group_set_uptodate(struct nfs_page *req) @@ -312,7 +317,7 @@ static void nfs_readpage_result(struct rpc_task *task, */ int nfs_readpage(struct file *file, struct page *page) { - struct nfs_open_context *ctx; + struct nfs_readdesc desc; struct inode *inode = page_file_mapping(page)->host; int ret; @@ -339,39 +344,34 @@ int nfs_readpage(struct file *file, struct page *page) if (file == NULL) { ret = -EBADF; - ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (ctx == NULL) + desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (desc.ctx == NULL) goto out_unlock; } else - ctx = get_nfs_open_context(nfs_file_open_context(file)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); if (!IS_SYNC(inode)) { - ret = nfs_readpage_from_fscache(ctx, inode, page); + ret = nfs_readpage_from_fscache(desc.ctx, inode, page); if (ret == 0) goto out; } - xchg(&ctx->error, 0); - ret = nfs_readpage_async(ctx, inode, page); + xchg(&desc.ctx->error, 0); + ret = nfs_readpage_async(&desc, inode, page); if (!ret) { ret = wait_on_page_locked_killable(page); if (!PageUptodate(page) && !ret) - ret = xchg(&ctx->error, 0); + ret = xchg(&desc.ctx->error, 0); } nfs_add_stats(inode, NFSIOS_READPAGES, 1); out: - put_nfs_open_context(ctx); + put_nfs_open_context(desc.ctx); return ret; out_unlock: unlock_page(page); return ret; } -struct nfs_readdesc { - struct nfs_pageio_descriptor *pgio; - struct nfs_open_context *ctx; -}; - static int readpage_async_filler(void *data, struct page *page) { @@ -390,9 +390,9 @@ readpage_async_filler(void *data, struct page *page) if (len < PAGE_SIZE) zero_user_segment(page, len, PAGE_SIZE); - if (!nfs_pageio_add_request(desc->pgio, new)) { + if (!nfs_pageio_add_request(&desc->pgio, new)) { nfs_list_remove_request(new); - error = desc->pgio->pg_error; + error = desc->pgio.pg_error; nfs_readpage_release(new, error); goto out; } @@ -407,7 +407,6 @@ readpage_async_filler(void *data, struct page *page) int nfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct nfs_pageio_descriptor pgio; struct nfs_pgio_mirror *pgm; struct nfs_readdesc desc; struct inode *inode = mapping->host; @@ -440,17 +439,16 @@ int nfs_readpages(struct file *file, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - desc.pgio = &pgio; - nfs_pageio_init_read(&pgio, inode, false, + nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&pgio); + nfs_pageio_complete(&desc.pgio); /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); + WARN_ON_ONCE(desc.pgio.pg_mirror_count != 1); - pgm = &pgio.pg_mirrors[0]; + pgm = &desc.pgio.pg_mirrors[0]; NFS_I(inode)->read_io += pgm->pg_bytes_written; npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 681ed98e4ba8..cb0248a34518 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -570,8 +570,7 @@ nfs_have_writebacks(struct inode *inode) extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, - struct page *); +extern int nfs_readpage_async(void *, struct inode *, struct page *); /* * inline functions From 0c119e3a18f994251c74c751e1657e4ef8da0c00 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 28 Jan 2021 09:55:02 -0500 Subject: [PATCH 092/506] NFS: Call readpage_async_filler() from nfs_readpage_async() Refactor slightly so nfs_readpage_async() calls into readpage_async_filler(). Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/read.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8c05e56dab65..0ed79e6bc486 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -119,31 +119,22 @@ struct nfs_readdesc { struct nfs_open_context *ctx; }; +static int readpage_async_filler(void *data, struct page *page); + int nfs_readpage_async(void *data, struct inode *inode, struct page *page) { struct nfs_readdesc *desc = data; - struct nfs_page *new; - unsigned int len; struct nfs_pgio_mirror *pgm; - - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, page, 0, len); - if (IS_ERR(new)) { - unlock_page(page); - return PTR_ERR(new); - } - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + int error; nfs_pageio_init_read(&desc->pgio, inode, false, &nfs_async_read_completion_ops); - if (!nfs_pageio_add_request(&desc->pgio, new)) { - nfs_list_remove_request(new); - nfs_readpage_release(new, desc->pgio.pg_error); - } + + error = readpage_async_filler(desc, page); + if (error) + goto out; + nfs_pageio_complete(&desc->pgio); /* It doesn't make sense to do mirrored reads! */ @@ -153,6 +144,9 @@ int nfs_readpage_async(void *data, struct inode *inode, NFS_I(inode)->read_io += pgm->pg_bytes_written; return desc->pgio.pg_error < 0 ? desc->pgio.pg_error : 0; + +out: + return error; } static void nfs_page_group_set_uptodate(struct nfs_page *req) From 1e83b173b2663b7d357309584678cf24787f0713 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 28 Jan 2021 09:55:03 -0500 Subject: [PATCH 093/506] NFS: Add nfs_pageio_complete_read() and remove nfs_readpage_async() Add nfs_pageio_complete_read() and call this from both nfs_readpage() and nfs_readpages(), since the submission and accounting is the same for both functions. Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/fscache.c | 4 -- fs/nfs/read.c | 137 ++++++++++++++++++----------------------- include/linux/nfs_fs.h | 1 - 3 files changed, 61 insertions(+), 81 deletions(-) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index a60df88efc40..c4c021c6ebbd 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page, if (!error) { SetPageUptodate(page); unlock_page(page); - } else { - error = nfs_readpage_async(context, page->mapping->host, page); - if (error) - unlock_page(page); } } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0ed79e6bc486..d2b6dce1f99f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); +static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + struct nfs_pgio_mirror *pgm; + unsigned long npages; + + nfs_pageio_complete(pgio); + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + pgm = &pgio->pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); +} + + void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; @@ -119,36 +137,6 @@ struct nfs_readdesc { struct nfs_open_context *ctx; }; -static int readpage_async_filler(void *data, struct page *page); - -int nfs_readpage_async(void *data, struct inode *inode, - struct page *page) -{ - struct nfs_readdesc *desc = data; - struct nfs_pgio_mirror *pgm; - int error; - - nfs_pageio_init_read(&desc->pgio, inode, false, - &nfs_async_read_completion_ops); - - error = readpage_async_filler(desc, page); - if (error) - goto out; - - nfs_pageio_complete(&desc->pgio); - - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(desc->pgio.pg_mirror_count != 1); - - pgm = &desc->pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - - return desc->pgio.pg_error < 0 ? desc->pgio.pg_error : 0; - -out: - return error; -} - static void nfs_page_group_set_uptodate(struct nfs_page *req) { if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) @@ -170,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { /* note: regions of the page not covered by a - * request are zeroed in nfs_readpage_async / - * readpage_async_filler */ + * request are zeroed in readpage_async_filler */ if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ @@ -303,6 +290,38 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } +static int +readpage_async_filler(void *data, struct page *page) +{ + struct nfs_readdesc *desc = data; + struct nfs_page *new; + unsigned int len; + int error; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + + new = nfs_create_request(desc->ctx, page, 0, len); + if (IS_ERR(new)) + goto out_error; + + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); + if (!nfs_pageio_add_request(&desc->pgio, new)) { + nfs_list_remove_request(new); + error = desc->pgio.pg_error; + nfs_readpage_release(new, error); + goto out; + } + return 0; +out_error: + error = PTR_ERR(new); + unlock_page(page); +out: + return error; +} + /* * Read a page over NFS. * We read the page synchronously in the following case: @@ -351,13 +370,20 @@ int nfs_readpage(struct file *file, struct page *page) } xchg(&desc.ctx->error, 0); - ret = nfs_readpage_async(&desc, inode, page); + nfs_pageio_init_read(&desc.pgio, inode, false, + &nfs_async_read_completion_ops); + + ret = readpage_async_filler(&desc, page); + + if (!ret) + nfs_pageio_complete_read(&desc.pgio, inode); + + ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; if (!ret) { ret = wait_on_page_locked_killable(page); if (!PageUptodate(page) && !ret) ret = xchg(&desc.ctx->error, 0); } - nfs_add_stats(inode, NFSIOS_READPAGES, 1); out: put_nfs_open_context(desc.ctx); return ret; @@ -366,45 +392,11 @@ int nfs_readpage(struct file *file, struct page *page) return ret; } -static int -readpage_async_filler(void *data, struct page *page) -{ - struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct nfs_page *new; - unsigned int len; - int error; - - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); - - new = nfs_create_request(desc->ctx, page, 0, len); - if (IS_ERR(new)) - goto out_error; - - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); - if (!nfs_pageio_add_request(&desc->pgio, new)) { - nfs_list_remove_request(new); - error = desc->pgio.pg_error; - nfs_readpage_release(new, error); - goto out; - } - return 0; -out_error: - error = PTR_ERR(new); - unlock_page(page); -out: - return error; -} - int nfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct nfs_pgio_mirror *pgm; struct nfs_readdesc desc; struct inode *inode = mapping->host; - unsigned long npages; int ret; dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", @@ -437,16 +429,9 @@ int nfs_readpages(struct file *file, struct address_space *mapping, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&desc.pgio); - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(desc.pgio.pg_mirror_count != 1); + nfs_pageio_complete_read(&desc.pgio, inode); - pgm = &desc.pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> - PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: put_nfs_open_context(desc.ctx); out: diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index cb0248a34518..3cfcf219e96b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -570,7 +570,6 @@ nfs_have_writebacks(struct inode *inode) extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern int nfs_readpage_async(void *, struct inode *, struct page *); /* * inline functions From 59ebc7fd74506367b109497eeef36034e648c943 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Thu, 24 Dec 2020 21:22:44 +0800 Subject: [PATCH 094/506] ext4: use DEFINE_MUTEX() for mutex lock mutex lock can be initialized automatically with DEFINE_MUTEX() rather than explicitly calling mutex_init(). Signed-off-by: Zheng Yongjun Link: https://lore.kernel.org/r/20201224132244.30907-1-zhengyongjun3@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a6f9875aa34..f361f9fb40d1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,7 +59,7 @@ #include static struct ext4_lazy_init *ext4_li_info; -static struct mutex ext4_li_mtx; +static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, @@ -6667,7 +6667,6 @@ static int __init ext4_init_fs(void) ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; - mutex_init(&ext4_li_mtx); /* Build-time check for flags consistency */ ext4_check_flag_values(); From 027f14f5357279655c3ebc6d14daff8368d4f53f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 21 Jan 2021 12:33:20 -0500 Subject: [PATCH 095/506] ext4: don't try to processed freed blocks until mballoc is initialized If we try to make any changes via the journal between when the journal is initialized, but before the multi-block allocated is initialized, we will end up deferencing a NULL pointer when the journal commit callback function calls ext4_process_freed_data(). The proximate cause of this failure was commit 2d01ddc86606 ("ext4: save error info to sb through journal if available") since file system corruption problems detected before the call to ext4_mb_init() would result in a journal commit before we aborted the mount of the file system.... and we would then trigger the NULL pointer deref. Link: https://lore.kernel.org/r/YAm8qH/0oo2ofSMR@mit.edu Reported-by: Murphy Zhou Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f361f9fb40d1..071d131fadd8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4875,7 +4875,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = @@ -4987,6 +4986,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount5; } + /* + * We can only set up the journal commit callback once + * mballoc is initialized + */ + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = + ext4_journal_commit_callback; + block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); From 96e7c02d0bbca9f33fb4175e5f302dab1de6f985 Mon Sep 17 00:00:00 2001 From: Daejun Park Date: Mon, 11 Jan 2021 10:37:26 +0900 Subject: [PATCH 096/506] ext4: Change list_for_each* to list_for_each_entry* In the fast_commit.c, list_for_each* + list_entry can be changed to list_for_each_entry*. It reduces number of variables and lines. Signed-off-by: Daejun Park Link: https://lore.kernel.org/r/20210111013726epcms2p4579ae56040d7043db785bf0d0a785dc7@epcms2p4 Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0a14a7c87bf8..619412134bbf 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -915,13 +915,11 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; - struct list_head *pos; int ret = 0; spin_lock(&sbi->s_fc_lock); ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); - list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { - ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { DEFINE_WAIT(wait); @@ -978,17 +976,15 @@ __releases(&sbi->s_fc_lock) { struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_fc_dentry_update *fc_dentry; + struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; - struct list_head *pos, *n, *fcd_pos, *fcd_n; - struct ext4_inode_info *ei; + struct ext4_inode_info *ei, *ei_n; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; - list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) { - fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update, - fcd_list); + list_for_each_entry_safe(fc_dentry, fc_dentry_n, + &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv( @@ -1004,8 +1000,8 @@ __releases(&sbi->s_fc_lock) } inode = NULL; - list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { - ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN], + i_fc_list) { if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { inode = &ei->vfs_inode; break; @@ -1057,7 +1053,6 @@ static int ext4_fc_perform_commit(journal_t *journal) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; - struct list_head *pos; struct inode *inode; struct blk_plug plug; int ret = 0; @@ -1099,8 +1094,7 @@ static int ext4_fc_perform_commit(journal_t *journal) goto out; } - list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { - iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; @@ -1226,9 +1220,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter; + struct ext4_inode_info *iter, *iter_n; struct ext4_fc_dentry_update *fc_dentry; - struct list_head *pos, *n; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; @@ -1236,8 +1229,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); - list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { - iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], + i_fc_list) { list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); From c6c818e50d1982ebac4dcd3ae18c1b49a66ebacb Mon Sep 17 00:00:00 2001 From: Vinicius Tinti Date: Tue, 2 Feb 2021 16:28:37 +0000 Subject: [PATCH 097/506] ext4: factor out htree rep invariant check This patch moves some debugging code which is used to validate the hash tree node when doing a binary search of an htree node into a separate function, which is disabled by default (since it is only used by developers when they are modifying the htree code paths). In addition to cleaning up the code to make it more maintainable, it silences a Clang compiler warning when -Wunreachable-code-aggressive is enabled. (There is no plan to enable this warning by default, since there it has far too many false positives; nevertheless, this commit reduces one of the many false positives by one.) Signed-off-by: Vinicius Tinti Link: https://lore.kernel.org/r/20210202162837.129631-1-viniciustinti@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cf652ba3e74d..a6e28b4b5a95 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -731,6 +731,29 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } + +/* + * Linear search cross check + */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ + while (n--) { + dxtrace(printk(KERN_CONT ",")); + if (dx_get_hash(++at) > hash) { + at--; + break; + } + } + ASSERT(at == target - 1); +} +#else /* DX_DEBUG */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ +} #endif /* DX_DEBUG */ /* @@ -827,20 +850,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, p = m + 1; } - if (0) { // linear search cross check - unsigned n = count - 1; - at = entries; - while (n--) - { - dxtrace(printk(KERN_CONT ",")); - if (dx_get_hash(++at) > hash) - { - at--; - break; - } - } - ASSERT(at == p - 1); - } + htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", From 6c3384d8f408548df74d27bfa0c95c47988f1480 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 30 Nov 2020 09:19:28 +0800 Subject: [PATCH 098/506] leds: lm3533: Switch to using the new API kobj_to_dev() fixed the following coccicheck: drivers/leds/leds-lm3533.c:611:60-61: WARNING opportunity for kobj_to_dev(). Signed-off-by: Tian Tao Signed-off-by: Pavel Machek --- drivers/leds/leds-lm3533.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-lm3533.c b/drivers/leds/leds-lm3533.c index b3edee703193..97911669a199 100644 --- a/drivers/leds/leds-lm3533.c +++ b/drivers/leds/leds-lm3533.c @@ -608,7 +608,7 @@ static struct attribute *lm3533_led_attributes[] = { static umode_t lm3533_led_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct led_classdev *led_cdev = dev_get_drvdata(dev); struct lm3533_led *led = to_lm3533_led(led_cdev); umode_t mode = attr->mode; From d86464527681b72941939258ef68904cae6f8136 Mon Sep 17 00:00:00 2001 From: Dylan Van Assche Date: Tue, 29 Dec 2020 19:15:12 +0100 Subject: [PATCH 099/506] leds: gpio: Set max brightness to 1 GPIO LEDs only know 2 states: ON or OFF and do not have PWM capabilities. However, the max brightness is reported as 255. This patch sets the max brightness value of a GPIO controlled LED to 1. Tested on my PinePhone 1.2. Signed-off-by: Dylan Van Assche Signed-off-by: Pavel Machek --- drivers/leds/leds-gpio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 93f5b1b60fde..b5d5e22d2d1e 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -96,7 +96,8 @@ static int create_gpio_led(const struct gpio_led *template, } else { state = (template->default_state == LEDS_GPIO_DEFSTATE_ON); } - led_dat->cdev.brightness = state ? LED_FULL : LED_OFF; + led_dat->cdev.brightness = state; + led_dat->cdev.max_brightness = 1; if (!template->retain_state_suspended) led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; if (template->panic_indicator) From b5776e7524afbd4569978ff790864755c438bba7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 4 Feb 2021 00:05:20 -0500 Subject: [PATCH 100/506] ext4: fix potential htree index checksum corruption In the case where we need to do an interior node split, and immediately afterwards, we are unable to allocate a new directory leaf block due to ENOSPC, the directory index checksum's will not be filled in correctly (and indeed, will not be correctly journalled). This looks like a bug that was introduced when we added largedir support. The original code doesn't make any sense (and should have been caught in code review), but it was hidden because most of the time, the index node checksum will be set by do_split(). But if do_split bails out due to ENOSPC, then ext4_handle_dirty_dx_node() won't get called, and so the directory index checksum field will not get set, leading to: EXT4-fs error (device sdb): dx_probe:858: inode #6635543: block 4022: comm nfsd: Directory index failed checksum Google-Bug-Id: 176345532 Fixes: e08ac99fa2a2 ("ext4: add largedir feature") Cc: Artem Blagodarenko Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a6e28b4b5a95..115762180801 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2411,11 +2411,10 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, (frame - 1)->bh); if (err) goto journal_error; - if (restart) { - err = ext4_handle_dirty_dx_node(handle, dir, - frame->bh); + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + if (err) goto journal_error; - } } else { struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, From 5ca6d0268df8d7a4e1e453bc8a1ee3459fde12a2 Mon Sep 17 00:00:00 2001 From: Bernard Zhao Date: Tue, 2 Feb 2021 04:32:03 -0800 Subject: [PATCH 101/506] drm/msm: remove unneeded variable: "rc" remove unneeded variable: "rc". Signed-off-by: Bernard Zhao Reviewed-by: Guenter Roeck Reviewed-by: Stephen Boyd Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_panel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_panel.c b/drivers/gpu/drm/msm/dp/dp_panel.c index d1780bcac8cc..9cc816663668 100644 --- a/drivers/gpu/drm/msm/dp/dp_panel.c +++ b/drivers/gpu/drm/msm/dp/dp_panel.c @@ -409,7 +409,6 @@ int dp_panel_timing_cfg(struct dp_panel *dp_panel) int dp_panel_init_panel_info(struct dp_panel *dp_panel) { - int rc = 0; struct drm_display_mode *drm_mode; drm_mode = &dp_panel->dp_mode.drm_mode; @@ -436,7 +435,7 @@ int dp_panel_init_panel_info(struct dp_panel *dp_panel) min_t(u32, dp_panel->dp_mode.bpp, 30)); DRM_DEBUG_DP("updated bpp = %d\n", dp_panel->dp_mode.bpp); - return rc; + return 0; } struct dp_panel *dp_panel_get(struct dp_panel_in *in) From 9929f4adce3679b08262b9193fbe0e04d65abb11 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Feb 2021 11:59:01 -0500 Subject: [PATCH 102/506] xprtrdma: Remove FMR support in rpcrdma_convert_iovs() Support for FMR was removed by commit ba69cd122ece ("xprtrdma: Remove support for FMR memory registration") [Dec 2018]. That means the buffer-splitting behavior of rpcrdma_convert_kvec(), added by commit 821c791a0bde ("xprtrdma: Segment head and tail XDR buffers on page boundaries") [Mar 2016], is no longer necessary. FRWR memory registration handles this case with aplomb. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 8f5d0cb68360..57f9217048d8 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -204,9 +204,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) return 0; } -/* Split @vec on page boundaries into SGEs. FMR registers pages, not - * a byte range. Other modes coalesce these SGEs into a single MR - * when they can. +/* Convert @vec to a single SGL element. * * Returns pointer to next available SGE, and bumps the total number * of SGEs consumed. @@ -215,22 +213,11 @@ static struct rpcrdma_mr_seg * rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, unsigned int *n) { - u32 remaining, page_offset; - char *base; - - base = vec->iov_base; - page_offset = offset_in_page(base); - remaining = vec->iov_len; - while (remaining) { - seg->mr_page = NULL; - seg->mr_offset = base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg->mr_len; - base += seg->mr_len; - ++seg; - ++(*n); - page_offset = 0; - } + seg->mr_page = NULL; + seg->mr_offset = vec->iov_base; + seg->mr_len = vec->iov_len; + ++seg; + ++(*n); return seg; } @@ -283,7 +270,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, goto out; if (xdrbuf->tail[0].iov_len) - seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); + rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); out: if (unlikely(n > RPCRDMA_MAX_SEGS)) From 54e6aec57c251e36a4c3153ccfee9538f2ec1bcf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Feb 2021 11:59:07 -0500 Subject: [PATCH 103/506] xprtrdma: Simplify rpcrdma_convert_kvec() and frwr_map() Clean up. Remove a conditional branch from the SGL set-up loop in frwr_map(): Instead of using either sg_set_page() or sg_set_buf(), initialize the mr_page field properly when rpcrdma_convert_kvec() converts the kvec to an SGL entry. frwr_map() can then invoke sg_set_page() unconditionally. Signed-off-by: Chuck Lever Reviewed-by: Tom Talpey Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 12 ++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 9 +++++---- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index baca49fe83af..13a50f77dddb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -306,14 +306,10 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, if (nsegs > ep->re_max_fr_depth) nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { - if (seg->mr_page) - sg_set_page(&mr->mr_sg[i], - seg->mr_page, - seg->mr_len, - offset_in_page(seg->mr_offset)); - else - sg_set_buf(&mr->mr_sg[i], seg->mr_offset, - seg->mr_len); + sg_set_page(&mr->mr_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); ++seg; ++i; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 57f9217048d8..b36b9aae0588 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -213,7 +213,7 @@ static struct rpcrdma_mr_seg * rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, unsigned int *n) { - seg->mr_page = NULL; + seg->mr_page = virt_to_page(vec->iov_base); seg->mr_offset = vec->iov_base; seg->mr_len = vec->iov_len; ++seg; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 94b28657aeeb..02971e183989 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -283,10 +283,11 @@ enum { RPCRDMA_MAX_IOV_SEGS, }; -struct rpcrdma_mr_seg { /* chunk descriptors */ - u32 mr_len; /* length of chunk or segment */ - struct page *mr_page; /* owning page, if any */ - char *mr_offset; /* kva if no page, else offset */ +/* Arguments for DMA mapping and registration */ +struct rpcrdma_mr_seg { + u32 mr_len; /* length of segment */ + struct page *mr_page; /* underlying struct page */ + char *mr_offset; /* IN: page offset, OUT: iova */ }; /* The Send SGE array is provisioned to send a maximum size From 67b16625d17b7c2e5babbbe817e12ad37c3fe3f4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Feb 2021 11:59:13 -0500 Subject: [PATCH 104/506] xprtrdma: Refactor invocations of offset_in_page() Clean up so that offset_in_page() is invoked less often in the most common case, which is mapping xdr->pages. Signed-off-by: Chuck Lever Reviewed-by: Tom Talpey Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 8 +++----- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 13a50f77dddb..766a1048a48a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -306,16 +306,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, if (nsegs > ep->re_max_fr_depth) nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { - sg_set_page(&mr->mr_sg[i], - seg->mr_page, - seg->mr_len, - offset_in_page(seg->mr_offset)); + sg_set_page(&mr->mr_sg[i], seg->mr_page, + seg->mr_len, seg->mr_offset); ++seg; ++i; if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) continue; - if ((i < nsegs && offset_in_page(seg->mr_offset)) || + if ((i < nsegs && seg->mr_offset) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index b36b9aae0588..b2482d9328d1 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -214,7 +214,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, unsigned int *n) { seg->mr_page = virt_to_page(vec->iov_base); - seg->mr_offset = vec->iov_base; + seg->mr_offset = offset_in_page(vec->iov_base); seg->mr_len = vec->iov_len; ++seg; ++(*n); @@ -246,7 +246,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = offset_in_page(xdrbuf->page_base); while (len) { seg->mr_page = *ppages; - seg->mr_offset = (char *)page_base; + seg->mr_offset = page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); len -= seg->mr_len; ++ppages; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 02971e183989..ed1c5444fb9d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -287,7 +287,7 @@ enum { struct rpcrdma_mr_seg { u32 mr_len; /* length of segment */ struct page *mr_page; /* underlying struct page */ - char *mr_offset; /* IN: page offset, OUT: iova */ + u64 mr_offset; /* IN: page offset, OUT: iova */ }; /* The Send SGE array is provisioned to send a maximum size From 84dff5eb86ceb2a356d5409fc1af21a366bf3c35 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Feb 2021 11:59:19 -0500 Subject: [PATCH 105/506] rpcrdma: Fix comments about reverse-direction operation During the final stages of publication of RFC 8167, reviewers requested that we use the term "reverse direction" rather than "backwards direction". Update comments to reflect this preference. Signed-off-by: Chuck Lever Reviewed-by: Tom Talpey Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 ++-- net/sunrpc/xprtrdma/rpc_rdma.c | 6 +----- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 946edf2db646..a249837d6a55 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2015-2020, Oracle and/or its affiliates. * - * Support for backward direction RPCs on RPC/RDMA. + * Support for reverse-direction RPCs on RPC/RDMA. */ #include @@ -208,7 +208,7 @@ static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt) } /** - * rpcrdma_bc_receive_call - Handle a backward direction call + * rpcrdma_bc_receive_call - Handle a reverse-direction Call * @r_xprt: transport receiving the call * @rep: receive buffer containing the call * diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index b2482d9328d1..283e9bfcab44 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1151,14 +1151,10 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) */ p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (unlikely(!p)) - goto out_short; + return true; rpcrdma_bc_receive_call(r_xprt, rep); return true; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - return true; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ { diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 63f8be974df2..4a1edbb4028e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2015-2018 Oracle. All rights reserved. * - * Support for backward direction RPCs on RPC/RDMA (server-side). + * Support for reverse-direction RPCs on RPC/RDMA (server-side). */ #include @@ -59,7 +59,7 @@ void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, spin_unlock(&xprt->queue_lock); } -/* Send a backwards direction RPC call. +/* Send a reverse-direction RPC Call. * * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ed1c5444fb9d..fe3be985e239 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -98,9 +98,9 @@ struct rpcrdma_ep { atomic_t re_completion_ids; }; -/* Pre-allocate extra Work Requests for handling backward receives - * and sends. This is a fixed value because the Work Queues are - * allocated when the forward channel is set up, long before the +/* Pre-allocate extra Work Requests for handling reverse-direction + * Receives and Sends. This is a fixed value because the Work Queues + * are allocated when the forward channel is set up, long before the * backchannel is provisioned. This value is two times * NFS4_DEF_CB_SLOT_TABLE_SIZE. */ From 2324fbedc207568b11354b146c3fc4ff2b89d588 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Feb 2021 11:59:25 -0500 Subject: [PATCH 106/506] xprtrdma: Pad optimization, revisited The NetApp Linux team discovered that with NFS/RDMA servers that do not support RFC 8797, the Linux client is forming NFSv4.x WRITE requests incorrectly. In this case, the Linux NFS client disables implicit chunk round-up for odd-length Read and Write chunks. The goal was to support old servers that needed that padding to be sent explicitly by clients. In that case the Linux NFS included the tail kvec in the Read chunk, since the tail contains any needed padding. That meant a separate memory registration is needed for the tail kvec, adding to the cost of forming such requests. To avoid that cost for a mere 3 bytes of zeroes that are always ignored by receivers, we try to use implicit roundup when possible. For NFSv4.x, the tail kvec also sometimes contains a trailing GETATTR operation. The Linux NFS client unintentionally includes that GETATTR operation in the Read chunk as well as inline. The fix is simply to /never/ include the tail kvec when forming a data payload Read chunk. The padding is thus now always present. Note that since commit 9ed5af268e88 ("SUNRPC: Clean up the handling of page padding in rpc_prepare_reply_pages()") [Dec 2020] the NFS client passes payload data to the transport with the padding in xdr->pages instead of in the send buffer's tail kvec. So now the Linux NFS client appends XDR padding to all odd-sized Read chunks. This shouldn't be a problem because: - RFC 8166-compliant servers are supposed to work with or without that XDR padding in Read chunks. - Since the padding is now in the same memory region as the data payload, a separate memory registration is not needed. In addition, the link layer extends data in RDMA Read responses to 4-byte boundaries anyway. Thus there is now no savings when the padding is not included. Because older kernels include the payload's XDR padding in the tail kvec, a fix there will be more complicated. Thus backporting this patch is not recommended. Reported by: Olga Kornievskaia Signed-off-by: Chuck Lever Reviewed-by: Tom Talpey Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 283e9bfcab44..1c3e377272e0 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -255,10 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = 0; } - /* When encoding a Read chunk, the tail iovec contains an - * XDR pad and may be omitted. - */ - if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup) + if (type == rpcrdma_readch) goto out; /* When encoding a Write chunk, some servers need to see an From c30f259a213802d5f4440fbb02e73b3b03403049 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 Feb 2021 11:59:32 -0500 Subject: [PATCH 107/506] rpcrdma: Capture bytes received in Receive completion tracepoints Make it easier to spot messages of an unusual size. Signed-off-by: Chuck Lever Acked-by: Tom Talpey Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 50 ++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 76e85e16854b..c838e7ac1c2d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -60,6 +60,51 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class, ), \ TP_ARGS(wc, cid)) +DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class, + TP_PROTO( + const struct ib_wc *wc, + const struct rpc_rdma_cid *cid + ), + + TP_ARGS(wc, cid), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(u32, received) + __field(unsigned long, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->status = wc->status; + if (wc->status) { + __entry->received = 0; + __entry->vendor_err = wc->vendor_err; + } else { + __entry->received = wc->byte_len; + __entry->vendor_err = 0; + } + ), + + TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x) received=%u", + __entry->cq_id, __entry->completion_id, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err, + __entry->received + ) +); + +#define DEFINE_RECEIVE_COMPLETION_EVENT(name) \ + DEFINE_EVENT(rpcrdma_receive_completion_class, name, \ + TP_PROTO( \ + const struct ib_wc *wc, \ + const struct rpc_rdma_cid *cid \ + ), \ + TP_ARGS(wc, cid)) + DECLARE_EVENT_CLASS(xprtrdma_reply_class, TP_PROTO( const struct rpcrdma_rep *rep @@ -838,7 +883,8 @@ TRACE_EVENT(xprtrdma_post_linv_err, ** Completion events **/ -DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive); +DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive); + DEFINE_COMPLETION_EVENT(xprtrdma_wc_send); DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg); DEFINE_COMPLETION_EVENT(xprtrdma_wc_li); @@ -1790,7 +1836,7 @@ TRACE_EVENT(svcrdma_post_recv, ) ); -DEFINE_COMPLETION_EVENT(svcrdma_wc_receive); +DEFINE_RECEIVE_COMPLETION_EVENT(svcrdma_wc_receive); TRACE_EVENT(svcrdma_rq_post_err, TP_PROTO( From 7df222c35920569c7f31d177a7249c70139008f1 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 4 Feb 2021 00:15:36 +0100 Subject: [PATCH 108/506] drm/msm/disp/mdp5: mdp5_cfg: Fix msm8974v2 max_clk The maximum mdp clock rate on msm8974v2 is 320MHz. Fix it. Signed-off-by: Konrad Dybcio Reviewed-by: Abhinav Kumar Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c index df10c1ac7591..94ce62a26daf 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c @@ -177,7 +177,7 @@ static const struct mdp5_cfg_hw msm8x74v2_config = { [3] = INTF_HDMI, }, }, - .max_clk = 200000000, + .max_clk = 320000000, }; static const struct mdp5_cfg_hw apq8084_config = { From c8d99bb938d3303989c4988caf090084073e85a2 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 4 Feb 2021 14:53:11 -0800 Subject: [PATCH 109/506] drm/msm: Fix legacy relocs path In moving code around, we ended up using the same pointer to copy_from_user() the relocs tables as we used for the cmd table entry, which is clearly not right. This went unnoticed because modern mesa on non-ancent kernels does not actually use relocs. But this broke ancient mesa on modern kernels. Reported-by: Emil Velikov Fixes: 20224d715a88 ("drm/msm/submit: Move copy_from_user ahead of locking bos") Signed-off-by: Rob Clark Reviewed-by: Akhil P Oommen --- drivers/gpu/drm/msm/msm_gem_submit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index d04c349d8112..5480852bdeda 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -198,6 +198,8 @@ static int submit_lookup_cmds(struct msm_gem_submit *submit, submit->cmd[i].idx = submit_cmd.submit_idx; submit->cmd[i].nr_relocs = submit_cmd.nr_relocs; + userptr = u64_to_user_ptr(submit_cmd.relocs); + sz = array_size(submit_cmd.nr_relocs, sizeof(struct drm_msm_gem_submit_reloc)); /* check for overflow: */ From 586a0787ce35f2e32d399b7e344e634e07de2997 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 5 Feb 2021 11:31:34 -0500 Subject: [PATCH 110/506] xprtrdma: Clean up rpcrdma_prepare_readch() Since commit 9ed5af268e88 ("SUNRPC: Clean up the handling of page padding in rpc_prepare_reply_pages()") [Dec 2020] the NFS client passes payload data to the transport with the padding in xdr->pages instead of in the send buffer's tail kvec. There's no need for the extra logic to advance the base of the tail kvec because the upper layer no longer places XDR padding there. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 1c3e377272e0..292f066d006e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -628,9 +628,8 @@ static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req, return false; } -/* The tail iovec may include an XDR pad for the page list, - * as well as additional content, and may not reside in the - * same page as the head iovec. +/* The tail iovec might not reside in the same page as the + * head iovec. */ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, struct xdr_buf *xdr, @@ -748,27 +747,19 @@ static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct xdr_buf *xdr) { + struct kvec *tail = &xdr->tail[0]; + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) return false; - /* If there is a Read chunk, the page list is being handled + /* If there is a Read chunk, the page list is handled * via explicit RDMA, and thus is skipped here. */ - /* Do not include the tail if it is only an XDR pad */ - if (xdr->tail[0].iov_len > 3) { - unsigned int page_base, len; - - /* If the content in the page list is an odd length, - * xdr_write_pages() adds a pad at the beginning of - * the tail iovec. Force the tail's non-pad content to - * land at the next XDR position in the Send message. - */ - page_base = offset_in_page(xdr->tail[0].iov_base); - len = xdr->tail[0].iov_len; - page_base += len & 3; - len -= len & 3; - if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) + if (tail->iov_len) { + if (!rpcrdma_prepare_tail_iov(req, xdr, + offset_in_page(tail->iov_base), + tail->iov_len)) return false; kref_get(&req->rl_kref); } From ea9f337ce81e315ef5643b7c843d6d8795461a5b Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Fri, 5 Feb 2021 12:44:38 -0800 Subject: [PATCH 111/506] drm/msm/dp: reset dp controller only at boot up and pm_resume DP_SW_RESET is the global SW reset that is used to initialize DP controller. If DP_SW_RESET executed during connection setup, two HPD related side effects may occurred, 1) pending HPD interrupts cleared unexpected 2) re start debounce logic which trigger another interrupt This patch only issue DP_SW_RESET at boot up and pm_resume. This patch also reinit video_comp before configure dp controller to avoid missing VIDEO_READY interrupt. Fixes: 9fc418430c65 ("drm/msm/dp: unplug interrupt missed after irq_hpd handler") Signed-off-by: Kuogee Hsieh Reviewed-by: Stephen Boyd Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_ctrl.c | 22 +++++++++------------- drivers/gpu/drm/msm/dp/dp_ctrl.h | 2 +- drivers/gpu/drm/msm/dp/dp_display.c | 14 +++++++------- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c index 55b7d0edffbf..f8e75e25d3cc 100644 --- a/drivers/gpu/drm/msm/dp/dp_ctrl.c +++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c @@ -1296,8 +1296,6 @@ static int dp_ctrl_setup_main_link(struct dp_ctrl_private *ctrl, * transitioned to PUSH_IDLE. In order to start transmitting * a link training pattern, we have to first do soft reset. */ - if (*training_step == DP_TRAINING_1) - dp_catalog_ctrl_reset(ctrl->catalog); ret = dp_ctrl_link_train(ctrl, cr, training_step); @@ -1366,7 +1364,7 @@ static int dp_ctrl_enable_stream_clocks(struct dp_ctrl_private *ctrl) return ret; } -int dp_ctrl_host_init(struct dp_ctrl *dp_ctrl, bool flip) +int dp_ctrl_host_init(struct dp_ctrl *dp_ctrl, bool flip, bool reset) { struct dp_ctrl_private *ctrl; struct dp_io *dp_io; @@ -1383,6 +1381,9 @@ int dp_ctrl_host_init(struct dp_ctrl *dp_ctrl, bool flip) ctrl->dp_ctrl.orientation = flip; + if (reset) + dp_catalog_ctrl_reset(ctrl->catalog); + dp_catalog_ctrl_phy_reset(ctrl->catalog); phy_init(phy); dp_catalog_ctrl_enable_irq(ctrl->catalog, true); @@ -1492,18 +1493,14 @@ static int dp_ctrl_deinitialize_mainlink(struct dp_ctrl_private *ctrl) return 0; } -static void dp_ctrl_link_idle_reset(struct dp_ctrl_private *ctrl) -{ - dp_ctrl_push_idle(&ctrl->dp_ctrl); - dp_catalog_ctrl_reset(ctrl->catalog); -} - static int dp_ctrl_link_maintenance(struct dp_ctrl_private *ctrl) { int ret = 0; struct dp_cr_status cr; int training_step = DP_TRAINING_NONE; + dp_ctrl_push_idle(&ctrl->dp_ctrl); + ctrl->dp_ctrl.pixel_rate = ctrl->panel->dp_mode.drm_mode.clock; ret = dp_ctrl_setup_main_link(ctrl, &cr, &training_step); @@ -1630,7 +1627,6 @@ void dp_ctrl_handle_sink_request(struct dp_ctrl *dp_ctrl) if (sink_request & DP_TEST_LINK_TRAINING) { dp_link_send_test_response(ctrl->link); - dp_ctrl_link_idle_reset(ctrl); if (dp_ctrl_link_maintenance(ctrl)) { DRM_ERROR("LM failed: TEST_LINK_TRAINING\n"); return; @@ -1684,7 +1680,7 @@ int dp_ctrl_on_link(struct dp_ctrl *dp_ctrl) break; } - training_step = DP_TRAINING_1; + training_step = DP_TRAINING_NONE; rc = dp_ctrl_setup_main_link(ctrl, &cr, &training_step); if (rc == 0) { /* training completed successfully */ @@ -1792,14 +1788,14 @@ int dp_ctrl_on_stream(struct dp_ctrl *dp_ctrl) * Set up transfer unit values and set controller state to send * video. */ + reinit_completion(&ctrl->video_comp); + dp_ctrl_configure_source_params(ctrl); dp_catalog_ctrl_config_msa(ctrl->catalog, ctrl->link->link_params.rate, ctrl->dp_ctrl.pixel_rate, dp_ctrl_use_fixed_nvid(ctrl)); - reinit_completion(&ctrl->video_comp); - dp_ctrl_setup_tr_unit(ctrl); dp_catalog_ctrl_state_ctrl(ctrl->catalog, DP_STATE_CTRL_SEND_VIDEO); diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.h b/drivers/gpu/drm/msm/dp/dp_ctrl.h index f60ba93c8678..a836bd358447 100644 --- a/drivers/gpu/drm/msm/dp/dp_ctrl.h +++ b/drivers/gpu/drm/msm/dp/dp_ctrl.h @@ -19,7 +19,7 @@ struct dp_ctrl { u32 pixel_rate; }; -int dp_ctrl_host_init(struct dp_ctrl *dp_ctrl, bool flip); +int dp_ctrl_host_init(struct dp_ctrl *dp_ctrl, bool flip, bool reset); void dp_ctrl_host_deinit(struct dp_ctrl *dp_ctrl); int dp_ctrl_on_link(struct dp_ctrl *dp_ctrl); int dp_ctrl_on_stream(struct dp_ctrl *dp_ctrl); diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index d9216f80cdad..5a39da6e1eaf 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -350,7 +350,7 @@ static int dp_display_process_hpd_high(struct dp_display_private *dp) return rc; } -static void dp_display_host_init(struct dp_display_private *dp) +static void dp_display_host_init(struct dp_display_private *dp, int reset) { bool flip = false; @@ -365,7 +365,7 @@ static void dp_display_host_init(struct dp_display_private *dp) dp_display_set_encoder_mode(dp); dp_power_init(dp->power, flip); - dp_ctrl_host_init(dp->ctrl, flip); + dp_ctrl_host_init(dp->ctrl, flip, reset); dp_aux_init(dp->aux); dp->core_initialized = true; } @@ -403,7 +403,7 @@ static int dp_display_usbpd_configure_cb(struct device *dev) goto end; } - dp_display_host_init(dp); + dp_display_host_init(dp, false); /* * set sink to normal operation mode -- D0 @@ -700,7 +700,7 @@ static int dp_irq_hpd_handle(struct dp_display_private *dp, u32 data) return 0; } - if (state == ST_CONNECT_PENDING) { + if (state == ST_CONNECT_PENDING || state == ST_DISCONNECT_PENDING) { /* wait until ST_CONNECTED */ dp_add_event(dp, EV_IRQ_HPD_INT, 0, 1); /* delay = 1 */ mutex_unlock(&dp->event_mutex); @@ -1012,7 +1012,7 @@ int dp_display_get_test_bpp(struct msm_dp *dp) static void dp_display_config_hpd(struct dp_display_private *dp) { - dp_display_host_init(dp); + dp_display_host_init(dp, true); dp_catalog_ctrl_hpd_config(dp->catalog); /* Enable interrupt first time @@ -1266,7 +1266,7 @@ static int dp_pm_resume(struct device *dev) dp->hpd_state = ST_DISCONNECTED; /* turn on dp ctrl/phy */ - dp_display_host_init(dp); + dp_display_host_init(dp, true); dp_catalog_ctrl_hpd_config(dp->catalog); @@ -1449,7 +1449,7 @@ int msm_dp_display_enable(struct msm_dp *dp, struct drm_encoder *encoder) state = dp_display->hpd_state; if (state == ST_DISPLAY_OFF) - dp_display_host_init(dp_display); + dp_display_host_init(dp_display, true); dp_display_enable(dp_display, 0); From 182b4a2d251305201b6f9cae29067f7112f05835 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 6 Feb 2021 20:18:58 -0800 Subject: [PATCH 112/506] drm/msm/dp: Add a missing semi-colon A missing semicolon here causes my external display to stop working. Indeed, missing the semicolon on the return statement leads to dp_panel_update_tu_timings() not existing because the compiler thinks it's part of the return statement of a void function, so it must not be important. $ ./scripts/bloat-o-meter before.o after.o add/remove: 1/1 grow/shrink: 0/1 up/down: 7400/-7540 (-140) Function old new delta dp_panel_update_tu_timings - 7400 +7400 _dp_ctrl_calc_tu.constprop 18024 17900 -124 dp_panel_update_tu_timings.constprop 7416 - -7416 Total: Before=54440, After=54300, chg -0.26% Add a semicolon so this function works like it used to. Cc: Sean Paul Cc: Kuogee Hsieh Cc: linux-arm-msm@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: freedreno@lists.freedesktop.org Cc: Lee Jones Fixes: cc9014bf63a4 ("drm/msm/dp/dp_ctrl: Move 'tu' from the stack to the heap") Signed-off-by: Stephen Boyd Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c index f8e75e25d3cc..8d59eb939975 100644 --- a/drivers/gpu/drm/msm/dp/dp_ctrl.c +++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c @@ -631,7 +631,7 @@ static void _dp_ctrl_calc_tu(struct dp_tu_calc_input *in, tu = kzalloc(sizeof(*tu), GFP_KERNEL); if (!tu) - return + return; dp_panel_update_tu_timings(in, tu); From c1fb1bf64bb63a1d6ae3311a9a3581a527c1f185 Mon Sep 17 00:00:00 2001 From: Defang Bo Date: Mon, 28 Dec 2020 10:07:45 +0800 Subject: [PATCH 113/506] m68k: let clk_enable() return immediately if clk is NULL Similar to commit<742859adc721>("m68k: let clk_disable() return immediately if clk is NULL"). there should be a check for clk to prevent NULL pointer dereference. Signed-off-by: Defang Bo Signed-off-by: Greg Ungerer --- arch/m68k/coldfire/clk.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/m68k/coldfire/clk.c b/arch/m68k/coldfire/clk.c index 7bc666e482eb..076a9caa9557 100644 --- a/arch/m68k/coldfire/clk.c +++ b/arch/m68k/coldfire/clk.c @@ -90,6 +90,10 @@ EXPORT_SYMBOL(clk_get); int clk_enable(struct clk *clk) { unsigned long flags; + + if (!clk) + return -EINVAL; + spin_lock_irqsave(&clk_lock, flags); if ((clk->enabled++ == 0) && clk->clk_ops) clk->clk_ops->enable(clk); From 45901a231723a5a513ff08477983f3a274a6a910 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Feb 2021 08:49:32 -0500 Subject: [PATCH 114/506] NFSv4: Fixes for nfs4_bitmask_adjust() We don't want to ask for the ACL in a WRITE reply, since we don't have a preallocated buffer. Instead of checking NFS_INO_INVALID_ACCESS, which is really about managing the access cache, we should look at the value of NFS_INO_INVALID_OTHER. Also ensure we assign the mode, owner and owner_group flags to the correct bit mask. Finally, fix up the check for NFS_INO_INVALID_CTIME to retrieve the ctime, and add a check for NFS_INO_INVALID_CHANGE. Fixes: 76bd5c016ef4 ("NFSv4: make cache consistency bitmask dynamic") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 34570a7785fa..8eb9c716010f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5435,15 +5435,16 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, if (cache_validity & NFS_INO_INVALID_ATIME) bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; - if (cache_validity & NFS_INO_INVALID_ACCESS) - bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | - FATTR4_WORD1_OWNER_GROUP; - if (cache_validity & NFS_INO_INVALID_ACL) - bitmask[0] |= FATTR4_WORD0_ACL; - if (cache_validity & NFS_INO_INVALID_LABEL) + if (cache_validity & NFS_INO_INVALID_OTHER) + bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | + FATTR4_WORD1_OWNER_GROUP | + FATTR4_WORD1_NUMLINKS; + if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; - if (cache_validity & NFS_INO_INVALID_CTIME) + if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; + if (cache_validity & NFS_INO_INVALID_CTIME) + bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; if (cache_validity & NFS_INO_INVALID_SIZE) From 37eaeed1a57e92d9db200ba7b4851a09c55eef5a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Feb 2021 08:55:45 -0500 Subject: [PATCH 115/506] NFS: Fix documenting comment for nfs_revalidate_file_size() Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 63940a7a70be..d02a63af9c15 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -89,7 +89,7 @@ nfs_file_release(struct inode *inode, struct file *filp) EXPORT_SYMBOL_GPL(nfs_file_release); /** - * nfs_revalidate_size - Revalidate the file size + * nfs_revalidate_file_size - Revalidate the file size * @inode: pointer to inode struct * @filp: pointer to struct file * From fc9dc401899ab280fe1849a0ca5800384726a793 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Feb 2021 08:55:46 -0500 Subject: [PATCH 116/506] NFS: Optimise sparse writes past the end of file If we're doing a write, and the entire page lies beyond the end-of-file, then we can assume the write can be extended to cover the beginning of the page, since we know the data in that region will be all zeros. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 4 +--- fs/nfs/write.c | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d02a63af9c15..02795a01c7ef 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -626,13 +626,11 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* * O_APPEND implies that we must revalidate the file length. */ - if (iocb->ki_flags & IOCB_APPEND) { + if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) goto out; } - if (iocb->ki_pos > i_size_read(inode)) - nfs_revalidate_mapping(inode, file->f_mapping); since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 639c34fec04a..6193350356a8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1278,19 +1278,21 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. */ -static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, + unsigned int pagelen) { struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (nfsi->cache_validity & + (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE)) return false; smp_rmb(); - if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0) return false; out: - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0) return false; return PageUptodate(page) != 0; } @@ -1310,7 +1312,8 @@ is_whole_file_wrlock(struct file_lock *fl) * If the file is opened for synchronous writes then we can just skip the rest * of the checks. */ -static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) +static int nfs_can_extend_write(struct file *file, struct page *page, + struct inode *inode, unsigned int pagelen) { int ret; struct file_lock_context *flctx = inode->i_flctx; @@ -1318,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino if (file->f_flags & O_DSYNC) return 0; - if (!nfs_write_pageuptodate(page, inode)) + if (!nfs_write_pageuptodate(page, inode, pagelen)) return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; @@ -1356,6 +1359,7 @@ int nfs_updatepage(struct file *file, struct page *page, struct nfs_open_context *ctx = nfs_file_open_context(file); struct address_space *mapping = page_file_mapping(page); struct inode *inode = mapping->host; + unsigned int pagelen = nfs_page_length(page); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1366,8 +1370,8 @@ int nfs_updatepage(struct file *file, struct page *page, if (!count) goto out; - if (nfs_can_extend_write(file, page, inode)) { - count = max(count + offset, nfs_page_length(page)); + if (nfs_can_extend_write(file, page, inode, pagelen)) { + count = max(count + offset, pagelen); offset = 0; } From 28aa2f9e73e762dbaa28fdca20cccb59c74cc139 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Feb 2021 08:55:47 -0500 Subject: [PATCH 117/506] NFS: Always clear an invalid mapping when attempting a buffered write If the page cache is invalid, then we can't do read-modify-write, so ensure that we do clear it when we know it is invalid. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 2 + fs/nfs/inode.c | 113 ++++++++++++++++++++++------------------- include/linux/nfs_fs.h | 1 + 3 files changed, 65 insertions(+), 51 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 02795a01c7ef..03fd1dcc96bd 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -632,6 +632,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } + nfs_clear_invalid_mapping(file->f_mapping); + since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 522aa10a1a3e..2533053d764a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1257,6 +1257,63 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } +/** + * nfs_clear_invalid_mapping - Conditionally clear a mapping + * @mapping: pointer to mapping + * + * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping. + */ +int nfs_clear_invalid_mapping(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + /* + * We must clear NFS_INO_INVALID_DATA first to ensure that + * invalidations that come in while we're shooting down the mappings + * are respected. But, that leaves a race window where one revalidator + * can clear the flag, and then another checks it before the mapping + * gets invalidated. Fix that by serializing access to this part of + * the function. + * + * At the same time, we need to allow other tasks to see whether we + * might be in the middle of invalidating the pages, so we only set + * the bit lock here if it looks like we're going to be doing that. + */ + for (;;) { + ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + goto out; + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + break; + spin_unlock(&inode->i_lock); + goto out; + } + + set_bit(NFS_INO_INVALIDATING, bitlock); + smp_wmb(); + nfsi->cache_validity &= + ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER); + spin_unlock(&inode->i_lock); + trace_nfs_invalidate_mapping_enter(inode); + ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_INVALIDATING); +out: + return ret; +} + bool nfs_mapping_need_revalidate_inode(struct inode *inode) { return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || @@ -1289,65 +1346,19 @@ int nfs_revalidate_mapping_rcu(struct inode *inode) * @inode: pointer to host inode * @mapping: pointer to mapping */ -int nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping) +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); - unsigned long *bitlock = &nfsi->flags; - int ret = 0; - /* swapfiles are not supposed to be shared. */ if (IS_SWAPFILE(inode)) - goto out; + return 0; if (nfs_mapping_need_revalidate_inode(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) - goto out; + return ret; } - /* - * We must clear NFS_INO_INVALID_DATA first to ensure that - * invalidations that come in while we're shooting down the mappings - * are respected. But, that leaves a race window where one revalidator - * can clear the flag, and then another checks it before the mapping - * gets invalidated. Fix that by serializing access to this part of - * the function. - * - * At the same time, we need to allow other tasks to see whether we - * might be in the middle of invalidating the pages, so we only set - * the bit lock here if it looks like we're going to be doing that. - */ - for (;;) { - ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - goto out; - spin_lock(&inode->i_lock); - if (test_bit(NFS_INO_INVALIDATING, bitlock)) { - spin_unlock(&inode->i_lock); - continue; - } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - break; - spin_unlock(&inode->i_lock); - goto out; - } - - set_bit(NFS_INO_INVALIDATING, bitlock); - smp_wmb(); - nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA| - NFS_INO_DATA_INVAL_DEFER); - spin_unlock(&inode->i_lock); - trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); - trace_nfs_invalidate_mapping_exit(inode, ret); - - clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_INO_INVALIDATING); -out: - return ret; + return nfs_clear_invalid_mapping(mapping); } static bool nfs_file_has_writers(struct nfs_inode *nfsi) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 3cfcf219e96b..2c662857247a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -387,6 +387,7 @@ extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); +extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); From 3258386aba670e3406a499d2d0b7395e14c8d097 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Wed, 13 Jan 2021 17:14:03 -0500 Subject: [PATCH 118/506] ext4: reset retry counter when ext4_alloc_file_blocks() makes progress Change the retry policy in ext4_alloc_file_blocks() to allow for a full retry cycle whenever a portion of an allocation request has been fulfilled. A large allocation request often results in multiple calls to ext4_map_blocks(), each of which is potentially subject to a temporary ENOSPC condition and retry cycle. The current code only allows for a single retry cycle. This patch does not address a known bug or reported complaint. However, it should make block allocation for fallocate and zero range more robust. In addition, simplify the conditional controlling the allocation while loop, where testing len alone is sufficient. Remove the assignment to ret2 in the error path after the call to ext4_map_blocks() since its value isn't subsequently used. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20210113221403.18258-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3960b7ec3ab7..77c7c8a54da7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4382,8 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, { struct inode *inode = file_inode(file); handle_t *handle; - int ret = 0; - int ret2 = 0, ret3 = 0; + int ret, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; @@ -4408,7 +4407,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, depth = ext_depth(inode); retry: - while (ret >= 0 && len) { + while (len) { /* * Recalculate credits when extent tree depth changes. */ @@ -4430,9 +4429,13 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); + ext4_journal_stop(handle); break; } + /* + * allow a full retry cycle for any remaining allocations + */ + retries = 0; map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; @@ -4450,11 +4453,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (unlikely(ret2)) break; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - } return ret > 0 ? ret2 : ret; } From 848fdd62399c638e65a1512616acaa5de7d5c5e8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Feb 2021 16:45:49 -0500 Subject: [PATCH 119/506] NFS: Don't set NFS_INO_INVALID_XATTR if there is no xattr cache Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2533053d764a..1575e3e1dda9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,6 +195,18 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); +#ifdef CONFIG_NFS_V4_2 +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return nfsi->xattr_cache != NULL; +} +#else +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return false; +} +#endif + static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); @@ -209,6 +221,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) | NFS_INO_INVALID_XATTR); } + if (!nfs_has_xattr_cache(nfsi)) + flags &= ~NFS_INO_INVALID_XATTR; if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; From 2770ef7c8aeaf28befcbdbe18727e93a42904028 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 29 Aug 2020 14:15:22 +0900 Subject: [PATCH 120/506] ia64: do not typedef struct pal_min_state_area_s Documentation/process/coding-style.rst says: Please don't use things like ``vps_t``. It's a **mistake** to use typedef for structures and pointers. This commit converts as follows: struct pal_min_state_area_s -> struct pal_min_state_area pal_min_state_area_t -> struct pal_min_state_area My main motivation for this is to slim down the include directives of in the next commit. Currently, is required to include directly or indirectly due to (pal_min_state_area_t *). Otherwise, it would have no idea what pal_min_state_area_t is. Replacing it with (struct pal_min_state_area *) will relax the header dependency since it is enough to tell it is a pointer to a structure, and to resolve the size of struct pal_min_state_area. It will make independent of . typedef's a lot of structures, but it is trivial to convert the others in the same way. Signed-off-by: Masahiro Yamada Reviewed-by: Randy Dunlap --- arch/ia64/include/asm/mca.h | 2 +- arch/ia64/include/asm/pal.h | 4 ++-- arch/ia64/include/asm/sal.h | 2 +- arch/ia64/kernel/asm-offsets.c | 18 +++++++++--------- arch/ia64/kernel/mca.c | 4 ++-- arch/ia64/kernel/mca_drv.c | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/ia64/include/asm/mca.h b/arch/ia64/include/asm/mca.h index 726df17f1b51..c92b9c15962c 100644 --- a/arch/ia64/include/asm/mca.h +++ b/arch/ia64/include/asm/mca.h @@ -83,7 +83,7 @@ struct ia64_sal_os_state { /* common */ unsigned long sal_ra; /* Return address in SAL, physical */ unsigned long sal_gp; /* GP of the SAL - physical */ - pal_min_state_area_t *pal_min_state; /* from R17. physical in asm, virtual in C */ + struct pal_min_state_area *pal_min_state; /* from R17. physical in asm, virtual in C */ /* Previous values of IA64_KR(CURRENT) and IA64_KR(CURRENT_STACK). * Note: if the MCA/INIT recovery code wants to resume to a new context * then it must change these values to reflect the new kernel stack. diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h index f9d2b3b2dfad..b1d87955e8cc 100644 --- a/arch/ia64/include/asm/pal.h +++ b/arch/ia64/include/asm/pal.h @@ -750,7 +750,7 @@ typedef union pal_mc_error_info_u { * for PAL. */ -typedef struct pal_min_state_area_s { +struct pal_min_state_area { u64 pmsa_nat_bits; /* nat bits for saved GRs */ u64 pmsa_gr[15]; /* GR1 - GR15 */ u64 pmsa_bank0_gr[16]; /* GR16 - GR31 */ @@ -766,7 +766,7 @@ typedef struct pal_min_state_area_s { u64 pmsa_xfs; /* previous ifs */ u64 pmsa_br1; /* branch register 1 */ u64 pmsa_reserved[70]; /* pal_min_state_area should total to 1KB */ -} pal_min_state_area_t; +}; struct ia64_pal_retval { diff --git a/arch/ia64/include/asm/sal.h b/arch/ia64/include/asm/sal.h index 08f5b6aaed73..78f4f7b40435 100644 --- a/arch/ia64/include/asm/sal.h +++ b/arch/ia64/include/asm/sal.h @@ -385,7 +385,7 @@ typedef struct sal_processor_static_info { fr : 1, reserved : 58; } valid; - pal_min_state_area_t min_state_area; + struct pal_min_state_area min_state_area; u64 br[8]; u64 cr[128]; u64 ar[128]; diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index fb0deb8a4221..be3b90fef2e9 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -245,23 +245,23 @@ void foo(void) BLANK(); DEFINE(IA64_PMSA_GR_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_gr)); + offsetof(struct pal_min_state_area, pmsa_gr)); DEFINE(IA64_PMSA_BANK1_GR_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_bank1_gr)); + offsetof(struct pal_min_state_area, pmsa_bank1_gr)); DEFINE(IA64_PMSA_PR_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_pr)); + offsetof(struct pal_min_state_area, pmsa_pr)); DEFINE(IA64_PMSA_BR0_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_br0)); + offsetof(struct pal_min_state_area, pmsa_br0)); DEFINE(IA64_PMSA_RSC_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_rsc)); + offsetof(struct pal_min_state_area, pmsa_rsc)); DEFINE(IA64_PMSA_IIP_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_iip)); + offsetof(struct pal_min_state_area, pmsa_iip)); DEFINE(IA64_PMSA_IPSR_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_ipsr)); + offsetof(struct pal_min_state_area, pmsa_ipsr)); DEFINE(IA64_PMSA_IFS_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_ifs)); + offsetof(struct pal_min_state_area, pmsa_ifs)); DEFINE(IA64_PMSA_XIP_OFFSET, - offsetof (struct pal_min_state_area_s, pmsa_xip)); + offsetof(struct pal_min_state_area, pmsa_xip)); BLANK(); /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 2703f7795672..17151269d655 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -894,7 +894,7 @@ static void finish_pt_regs(struct pt_regs *regs, struct ia64_sal_os_state *sos, unsigned long *nat) { - const pal_min_state_area_t *ms = sos->pal_min_state; + const struct pal_min_state_area *ms = sos->pal_min_state; const u64 *bank; /* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use @@ -970,7 +970,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, char *p; ia64_va va; extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ - const pal_min_state_area_t *ms = sos->pal_min_state; + const struct pal_min_state_area *ms = sos->pal_min_state; struct task_struct *previous_current; struct pt_regs *old_regs; struct switch_stack *old_sw; diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index 4d0ab323dee8..36a69b4e6169 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -496,7 +496,7 @@ recover_from_read_error(slidx_table_t *slidx, struct ia64_sal_os_state *sos) { u64 target_identifier; - pal_min_state_area_t *pmsa; + struct pal_min_state_area *pmsa; struct ia64_psr *psr1, *psr2; ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook; From a5b7c61ee6ad475e2d7dd1e374f45329bd38e687 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 29 Aug 2020 14:15:23 +0900 Subject: [PATCH 121/506] ia64: remove unneeded header includes from includes too many unneeded headers. This commit cuts off a lot of header includes. What we need to include are: - for DECLARE_PER_CPU(u64, ia64_mca_pal_base) - for NR_CPUS - for u8, u64, size_t, etc. - for KERNEL_STACK_SIZE The other header includes are actually unneeded. previously included 436 headers, and now it includes only 138. I confirmed is still self-contained. Signed-off-by: Masahiro Yamada Reviewed-by: Randy Dunlap Acked-by: Ard Biesheuvel --- arch/ia64/include/asm/mca.h | 9 +++------ arch/ia64/kernel/crash.c | 1 + arch/ia64/kernel/efi.c | 1 + arch/ia64/kernel/mca.c | 1 + 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/ia64/include/asm/mca.h b/arch/ia64/include/asm/mca.h index c92b9c15962c..05805249296c 100644 --- a/arch/ia64/include/asm/mca.h +++ b/arch/ia64/include/asm/mca.h @@ -14,13 +14,10 @@ #if !defined(__ASSEMBLY__) -#include +#include +#include #include - -#include -#include -#include -#include +#include #define IA64_MCA_RENDEZ_TIMEOUT (20 * 1000) /* value in milliseconds - 20 seconds */ diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index fec70d662d0c..368113731087 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f932b25fb817..b6bb718ed1ff 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 17151269d655..3911c561d2bb 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include From fa1e160b08e8ceabecbd5b42d8268278197c3e67 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 29 Aug 2020 14:15:24 +0900 Subject: [PATCH 122/506] ia64: remove generated/nr-irqs.h generation to fix build warning Randy reports the following warning when building ARCH=ia64 with CONFIG_IA64_PALINFO=m: ../scripts/Makefile.build:68: 'arch/ia64/kernel/palinfo.ko' will not be built even though obj-m is specified. ../scripts/Makefile.build:69: You cannot use subdir-y/m to visit a module Makefile. Use obj-y/m instead. This message is actually false-positive, and you can get palinfo.ko correctly built. It is emitted in the archprepare stage, where Kbuild descends into arch/ia64/kernel to generate include/generated/nr-irqs.h instead of any kind of kernel objects. arch/ia64/kernel/nr-irqs.c was introduced by commit 213060a4d699 ("[IA64] pvops: paravirtualize NR_IRQS") to pre-calculate: NR_IRQS = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, FOO_NR_IRQS...) Since commit d52eefb47d4e ("ia64/xen: Remove Xen support for ia64"), this union contains just one field, making NR_IRQS and IA64_NATIVE_NR_IRQS always match. So, the following hard-coding now works: #define NR_IRQS IA64_NATIVE_NR_IRQS If you need to re-introduce NR_IRQS = max(...) gimmick in the future, please try to implement it in asm-offsets.c instead of a separate file. It will be possible because the header inclusion has been consolidated to make asm-offsets.c independent of . Reported-by: Randy Dunlap Signed-off-by: Masahiro Yamada Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap --- arch/ia64/Makefile | 6 ------ arch/ia64/include/asm/irq.h | 4 +++- arch/ia64/kernel/Makefile | 5 ----- arch/ia64/kernel/nr-irqs.c | 22 ---------------------- 4 files changed, 3 insertions(+), 34 deletions(-) delete mode 100644 arch/ia64/kernel/nr-irqs.c diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 45d5368d6a99..b950a23d5219 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -86,9 +86,3 @@ define archhelp echo ' install - Install compressed kernel image' echo '* unwcheck - Check vmlinux for invalid unwind info' endef - -archprepare: make_nr_irqs_h -PHONY += make_nr_irqs_h - -make_nr_irqs_h: - $(Q)$(MAKE) $(build)=arch/ia64/kernel include/generated/nr-irqs.h diff --git a/arch/ia64/include/asm/irq.h b/arch/ia64/include/asm/irq.h index 5acf52e90872..0eccf33dfe8b 100644 --- a/arch/ia64/include/asm/irq.h +++ b/arch/ia64/include/asm/irq.h @@ -14,7 +14,9 @@ #include #include -#include +#include + +#define NR_IRQS IA64_NATIVE_NR_IRQS static __inline__ int irq_canonicalize (int irq) diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c89bd5f8cbf8..78717819131c 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -47,8 +47,3 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. include $(src)/Makefile.gate - -include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s FORCE - $(call filechk,offsets,__ASM_NR_IRQS_H__) - -targets += nr-irqs.s diff --git a/arch/ia64/kernel/nr-irqs.c b/arch/ia64/kernel/nr-irqs.c deleted file mode 100644 index f2633b22d3be..000000000000 --- a/arch/ia64/kernel/nr-irqs.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * calculate - * NR_IRQS = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, FOO_NR_IRQS...) - * depending on config. - * This must be calculated before processing asm-offset.c. - */ - -#define ASM_OFFSETS_C 1 - -#include -#include -#include - -void foo(void) -{ - union paravirt_nr_irqs_max { - char ia64_native_nr_irqs[IA64_NATIVE_NR_IRQS]; - }; - - DEFINE(NR_IRQS, sizeof (union paravirt_nr_irqs_max)); -} From db4632c65eb505410f2e6be9c4d50226c973a129 Mon Sep 17 00:00:00 2001 From: Tor Vic Date: Wed, 16 Dec 2020 13:58:02 +0100 Subject: [PATCH 123/506] Makefile: use smaller dictionary size for xz module compression By default, xz without parameters uses a dictionary size of 8 MB. However, most modules are much smaller than that. The xz manpage states that 'increasing dictionary size usually improves compression ratio, but a dictionary bigger than the uncompressed file is waste of memory'. Use a dictionary size of 2 MB for module compression, resulting in slightly higher compression speed while still maintaining a good compression ratio. Signed-off-by: Tor Vic Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ade44ac4cc2f..f0cfcc7e0886 100644 --- a/Makefile +++ b/Makefile @@ -1053,7 +1053,7 @@ ifdef CONFIG_MODULE_COMPRESS mod_compress_cmd = $(KGZIP) -n -f endif # CONFIG_MODULE_COMPRESS_GZIP ifdef CONFIG_MODULE_COMPRESS_XZ - mod_compress_cmd = $(XZ) -f + mod_compress_cmd = $(XZ) --lzma2=dict=2MiB -f endif # CONFIG_MODULE_COMPRESS_XZ endif # CONFIG_MODULE_COMPRESS export mod_compress_cmd From 83272e6d4765df775e43d5fc4797b4b3fe9a97fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 1 Dec 2020 12:27:48 +0900 Subject: [PATCH 124/506] kbuild: Remove $(cc-option,-gdwarf-4) dependency from DEBUG_INFO_DWARF4 The -gdwarf-4 flag is supported by GCC 4.5+, and also by Clang. You can see it at https://godbolt.org/z/6ed1oW For gcc 4.5.3 pane, line 37: .value 0x4 For clang 10.0.1 pane, line 117: .short 4 Given Documentation/process/changes.rst stating GCC 4.9 is the minimal version, this cc-option is unneeded. Note ---- CONFIG_DEBUG_INFO_DWARF4 controls the DWARF version only for C files. As you can see in the top Makefile, -gdwarf-4 is only passed to CFLAGS. ifdef CONFIG_DEBUG_INFO_DWARF4 DEBUG_CFLAGS += -gdwarf-4 endif This flag is used when compiling *.c files. On the other hand, the assembler is always given -gdwarf-2. KBUILD_AFLAGS += -Wa,-gdwarf-2 Hence, the debug info that comes from *.S files is always DWARF v2. This is simply because GAS supported only -gdwarf-2 for a long time. Recently, GAS gained the support for --gdwarf-[345] options. [1] And, also we have Clang integrated assembler. So, the debug info for *.S files might be improved in the future. In my understanding, the current code is intentional, not a bug. [1] https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=31bf18645d98b4d3d7357353be840e320649a67d Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Reviewed-by: Nathan Chancellor --- lib/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7937265ef879..9cf4d12b81fb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -258,7 +258,6 @@ config DEBUG_INFO_SPLIT config DEBUG_INFO_DWARF4 bool "Generate dwarf4 debuginfo" - depends on $(cc-option,-gdwarf-4) help Generate dwarf4 debug info. This requires recent versions of gcc and gdb. It makes the debug information larger. From 052c805a1851a4415f9e2adfa9654a0b793e0c45 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 13 Dec 2020 01:54:30 +0900 Subject: [PATCH 125/506] kbuild: LD_VERSION redenomination Commit ccbef1674a15 ("Kbuild, lto: add ld-version and ld-ifversion macros") introduced scripts/ld-version.sh for GCC LTO. At that time, this script handled 5 version fields because GCC LTO needed the downstream binutils. (https://lkml.org/lkml/2014/4/8/272) The code snippet from the submitted patch was as follows: # We need HJ Lu's Linux binutils because mainline binutils does not # support mixing assembler and LTO code in the same ld -r object. # XXX check if the gcc plugin ld is the expected one too # XXX some Fedora binutils should also support it. How to check for that? ifeq ($(call ld-ifversion,-ge,22710001,y),y) ... However, GCC LTO was not merged into the mainline after all. (https://lkml.org/lkml/2014/4/8/272) So, the 4th and 5th fields were never used, and finally removed by commit 0d61ed17dd30 ("ld-version: Drop the 4th and 5th version components"). Since then, the last 4-digits returned by this script is always zeros. Remove the meaningless last 4-digits. This makes the version format consistent with GCC_VERSION, CLANG_VERSION, LLD_VERSION. Signed-off-by: Masahiro Yamada Acked-by: Will Deacon Acked-by: Thomas Bogendoerfer --- arch/arm64/Kconfig | 2 +- arch/mips/loongson64/Platform | 2 +- arch/mips/vdso/Kconfig | 2 +- arch/powerpc/Makefile | 2 +- arch/powerpc/lib/Makefile | 2 +- scripts/ld-version.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f39568b28ec1..bbe72c6aec7f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1477,7 +1477,7 @@ config ARM64_PTR_AUTH depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC # Modern compilers insert a .note.gnu.property section note for PAC # which is only understood by binutils starting with version 2.33.1. - depends on LD_IS_LLD || LD_VERSION >= 233010000 || (CC_IS_GCC && GCC_VERSION < 90100) + depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100) depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) help diff --git a/arch/mips/loongson64/Platform b/arch/mips/loongson64/Platform index ec42c5085905..cc0b9c87f9ad 100644 --- a/arch/mips/loongson64/Platform +++ b/arch/mips/loongson64/Platform @@ -35,7 +35,7 @@ cflags-$(CONFIG_CPU_LOONGSON64) += $(call as-option,-Wa$(comma)-mno-fix-loongson # can't easily be used safely within the kbuild framework. # ifeq ($(call cc-ifversion, -ge, 0409, y), y) - ifeq ($(call ld-ifversion, -ge, 225000000, y), y) + ifeq ($(call ld-ifversion, -ge, 22500, y), y) cflags-$(CONFIG_CPU_LOONGSON64) += \ $(call cc-option,-march=loongson3a -U_MIPS_ISA -D_MIPS_ISA=_MIPS_ISA_MIPS64) else diff --git a/arch/mips/vdso/Kconfig b/arch/mips/vdso/Kconfig index 7aec721398d5..a665f6108cb5 100644 --- a/arch/mips/vdso/Kconfig +++ b/arch/mips/vdso/Kconfig @@ -12,7 +12,7 @@ # the lack of relocations. As such, we disable the VDSO for microMIPS builds. config MIPS_LD_CAN_LINK_VDSO - def_bool LD_VERSION >= 225000000 || LD_IS_LLD + def_bool LD_VERSION >= 22500 || LD_IS_LLD config MIPS_DISABLE_VDSO def_bool CPU_MICROMIPS || (!CPU_MIPSR6 && !MIPS_LD_CAN_LINK_VDSO) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 08cf0eade56a..66a814e3b207 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -65,7 +65,7 @@ UTS_MACHINE := $(subst $(space),,$(machine-y)) ifdef CONFIG_PPC32 KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o else -ifeq ($(call ld-ifversion, -ge, 225000000, y),y) +ifeq ($(call ld-ifversion, -ge, 22500, y),y) # Have the linker provide sfpr if possible. # There is a corresponding test in arch/powerpc/lib/Makefile KBUILD_LDFLAGS_MODULE += --save-restore-funcs diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 69a91b571845..d4efc182662a 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o # 64-bit linker creates .sfpr on demand for final link (vmlinux), # so it is only needed for modules, and only for older linkers which # do not support --save-restore-funcs -ifeq ($(call ld-ifversion, -lt, 225000000, y),y) +ifeq ($(call ld-ifversion, -lt, 22500, y),y) extra-$(CONFIG_PPC64) += crtsavres.o endif diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh index f2be0ff9a738..0f8a2c0f9502 100755 --- a/scripts/ld-version.sh +++ b/scripts/ld-version.sh @@ -6,6 +6,6 @@ gsub(".*version ", ""); gsub("-.*", ""); split($1,a, "."); - print a[1]*100000000 + a[2]*1000000 + a[3]*10000; + print a[1]*10000 + a[2]*100 + a[3]; exit } From 302fdadeafe4be539f247abf25f61822e4a5a577 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 22 Jan 2021 12:02:34 +0100 Subject: [PATCH 126/506] ext: EXT4_KUNIT_TESTS should depend on EXT4_FS instead of selecting it EXT4_KUNIT_TESTS selects EXT4_FS, thus enabling an optional feature the user may not want to enable. Fix this by making the test depend on EXT4_FS instead. Fixes: 1cbeab1b242d16fd ("ext4: add kunit test for decoding extended timestamps") Reviewed-by: Randy Dunlap Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20210122110234.2825685-1-geert@linux-m68k.org Signed-off-by: Theodore Ts'o --- fs/ext4/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 619dd35ddd48..86699c8cab28 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -103,8 +103,7 @@ config EXT4_DEBUG config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS - select EXT4_FS - depends on KUNIT + depends on EXT4_FS && KUNIT default KUNIT_ALL_TESTS help This builds the ext4 KUnit tests. From 0a76945fd1ba2ab44da7b578b311efdfedf92e6c Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Tue, 9 Feb 2021 17:32:06 -0800 Subject: [PATCH 127/506] ext4: add .kunitconfig fragment to enable ext4-specific tests As of [1], we no longer want EXT4_KUNIT_TESTS and others to `select` their deps. This means it can get harder to get all the right things selected as we gain more tests w/ more deps over time. This patch (and [2]) proposes we store kunitconfig fragments in-tree to represent sets of tests. (N.B. right now we only have one ext4 test). There's still a discussion to be had about how to have a hierarchy of these files (e.g. if one wanted to test all of fs/, not just fs/ext4). But this fragment would likely be a leaf node and isn't blocked on deciding if we want `import` statements and the like. Usage ===== Before [2] (on its way to being merged): $ cp fs/ext4/.kunitconfig .kunit/ $ ./tools/testing/kunit/kunit.py run After [2]: $ ./tools/testing/kunit/kunit.py run --kunitconfig=fs/ext4/.kunitconfig ".kunitconfig" vs "kunitconfig" =============================== See also: commit 14ee5cfd4512 ("kunit: Rename 'kunitconfig' to '.kunitconfig'"). * The bit about .gitignore exluding it by default is now a con, however. * But there are a lot of directories with files that begin with "k" and so this could cause some annoyance w/ tab completion* * This is the name kunit.py expects right now, so some people are used to .kunitconfig over "kunitconfig" [1] https://lore.kernel.org/linux-ext4/20210122110234.2825685-1-geert@linux-m68k.org/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git/commit/?h=kunit&id=243180f5924ed27ea417db39feb7f9691777688e * 372/5556 directories isn't too much, but still not a small number: $ find -type f -name 'k*' | xargs dirname | sort -u | wc -l 372 Signed-off-by: Daniel Latypov Link: https://lore.kernel.org/r/20210210013206.136227-1-dlatypov@google.com Signed-off-by: Theodore Ts'o --- fs/ext4/.kunitconfig | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 fs/ext4/.kunitconfig diff --git a/fs/ext4/.kunitconfig b/fs/ext4/.kunitconfig new file mode 100644 index 000000000000..bf51da7cd9fc --- /dev/null +++ b/fs/ext4/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_KUNIT_TESTS=y From 1838b06bf01ac2b1b9ea808aa5962d5324b4da8f Mon Sep 17 00:00:00 2001 From: Ignacio Alvarado Date: Sat, 13 Feb 2021 00:14:52 +0000 Subject: [PATCH 128/506] selftests: kvm: add hardware_disable test This test launches 512 VMs in serial and kills them after a random amount of time. The test was original written to exercise KVM user notifiers in the context of1650b4ebc99d: - KVM: Disable irq while unregistering user notifier - https://lore.kernel.org/kvm/CACXrx53vkO=HKfwWwk+fVpvxcNjPrYmtDZ10qWxFvVX_PTGp3g@mail.gmail.com/ Recently, this test piqued my interest because it proved useful to for AMD SNP in exercising the "in-use" pages, described in APM section 15.36.12, "Running SNP-Active Virtual Machines". Signed-off-by: Ignacio Alvarado Signed-off-by: Marc Orr Message-Id: <20210213001452.1719001-1-marcorr@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/hardware_disable_test.c | 165 ++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 tools/testing/selftests/kvm/hardware_disable_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 3a84394829ea..32b87cc77c8e 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -33,6 +33,7 @@ /demand_paging_test /dirty_log_test /dirty_log_perf_test +/hardware_disable_test /kvm_create_max_vcpus /memslot_modification_stress_test /set_memory_region_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 8c8eda429576..a6d61f451f88 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test +TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += set_memory_region_test diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c new file mode 100644 index 000000000000..2f2eeb8a1d86 --- /dev/null +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This test is intended to reproduce a crash that happens when + * kvm_arch_hardware_disable is called and it attempts to unregister the user + * return notifiers. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kvm_util.h" + +#define VCPU_NUM 4 +#define SLEEPING_THREAD_NUM (1 << 4) +#define FORK_NUM (1ULL << 9) +#define DELAY_US_MAX 2000 +#define GUEST_CODE_PIO_PORT 4 + +sem_t *sem; + +/* Arguments for the pthreads */ +struct payload { + struct kvm_vm *vm; + uint32_t index; +}; + +static void guest_code(void) +{ + for (;;) + ; /* Some busy work */ + printf("Should not be reached.\n"); +} + +static void *run_vcpu(void *arg) +{ + struct payload *payload = (struct payload *)arg; + struct kvm_run *state = vcpu_state(payload->vm, payload->index); + + vcpu_run(payload->vm, payload->index); + + TEST_ASSERT(false, "%s: exited with reason %d: %s\n", + __func__, state->exit_reason, + exit_reason_str(state->exit_reason)); + pthread_exit(NULL); +} + +static void *sleeping_thread(void *arg) +{ + int fd; + + while (true) { + fd = open("/dev/null", O_RDWR); + close(fd); + } + TEST_ASSERT(false, "%s: exited\n", __func__); + pthread_exit(NULL); +} + +static inline void check_create_thread(pthread_t *thread, pthread_attr_t *attr, + void *(*f)(void *), void *arg) +{ + int r; + + r = pthread_create(thread, attr, f, arg); + TEST_ASSERT(r == 0, "%s: failed to create thread", __func__); +} + +static inline void check_set_affinity(pthread_t thread, cpu_set_t *cpu_set) +{ + int r; + + r = pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpu_set); + TEST_ASSERT(r == 0, "%s: failed set affinity", __func__); +} + +static inline void check_join(pthread_t thread, void **retval) +{ + int r; + + r = pthread_join(thread, retval); + TEST_ASSERT(r == 0, "%s: failed to join thread", __func__); +} + +static void run_test(uint32_t run) +{ + struct kvm_vm *vm; + cpu_set_t cpu_set; + pthread_t threads[VCPU_NUM]; + pthread_t throw_away; + struct payload payloads[VCPU_NUM]; + void *b; + uint32_t i, j; + + CPU_ZERO(&cpu_set); + for (i = 0; i < VCPU_NUM; i++) + CPU_SET(i, &cpu_set); + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_create_irqchip(vm); + + fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run); + for (i = 0; i < VCPU_NUM; ++i) { + vm_vcpu_add_default(vm, i, guest_code); + payloads[i].vm = vm; + payloads[i].index = i; + + check_create_thread(&threads[i], NULL, run_vcpu, + (void *)&payloads[i]); + check_set_affinity(threads[i], &cpu_set); + + for (j = 0; j < SLEEPING_THREAD_NUM; ++j) { + check_create_thread(&throw_away, NULL, sleeping_thread, + (void *)NULL); + check_set_affinity(throw_away, &cpu_set); + } + } + fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run); + sem_post(sem); + for (i = 0; i < VCPU_NUM; ++i) + check_join(threads[i], &b); + /* Should not be reached */ + TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); +} + +int main(int argc, char **argv) +{ + uint32_t i; + int s, r; + pid_t pid; + + sem = sem_open("vm_sem", O_CREAT | O_EXCL, 0644, 0); + sem_unlink("vm_sem"); + + for (i = 0; i < FORK_NUM; ++i) { + pid = fork(); + TEST_ASSERT(pid >= 0, "%s: unable to fork", __func__); + if (pid == 0) + run_test(i); /* This function always exits */ + + fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i); + sem_wait(sem); + r = (rand() % DELAY_US_MAX) + 1; + fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r); + usleep(r); + r = waitpid(pid, &s, WNOHANG); + TEST_ASSERT(r != pid, + "%s: [%d] child exited unexpectedly status: [%d]", + __func__, i, s); + fprintf(stderr, "%s: [%d] killing child\n", __func__, i); + kill(pid, SIGKILL); + } + + sem_destroy(sem); + exit(0); +} From bcd22e145b9a65dd603c7b3d8079e948922787e4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 15 Feb 2021 11:42:01 -0500 Subject: [PATCH 129/506] selftests: kvm: avoid uninitialized variable warning The variable in practice will never be uninitialized, because the loop will always go through at least one iteration. In case it would not, make vcpu_get_cpuid report an assertion failure. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index de0c76177d02..a8906e60a108 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -720,7 +720,8 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct kvm_cpuid2 *cpuid; - int rc, max_ent; + int max_ent; + int rc = -1; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); From aec6c60a01d3a3170242d6a99372a388e1136dc6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 16 Jan 2021 08:35:42 +0900 Subject: [PATCH 130/506] kbuild: check the minimum compiler version in Kconfig Paul Gortmaker reported a regression in the GCC version check. [1] If you use GCC 4.8, the build breaks before showing the error message "error Sorry, your version of GCC is too old - please use 4.9 or newer." I do not want to apply his fix-up since it implies we would not be able to remove any cc-option test. Anyway, I admit checking the GCC version in is too late. Almost at the same time, Linus also suggested to move the compiler version error to Kconfig time. [2] I unified the two similar scripts, gcc-version.sh and clang-version.sh into cc-version.sh. The old scripts invoked the compiler multiple times (3 times for gcc-version.sh, 4 times for clang-version.sh). I refactored the code so the new one invokes the compiler just once, and also tried my best to use shell-builtin commands where possible. The new script runs faster. $ time ./scripts/clang-version.sh clang 120000 real 0m0.029s user 0m0.012s sys 0m0.021s $ time ./scripts/cc-version.sh clang Clang 120000 real 0m0.009s user 0m0.006s sys 0m0.004s cc-version.sh also shows an error message if the compiler is too old: $ make defconfig CC=clang-9 *** Default configuration is based on 'x86_64_defconfig' *** *** Compiler is too old. *** Your Clang version: 9.0.1 *** Minimum Clang version: 10.0.1 *** scripts/Kconfig.include:46: Sorry, this compiler is not supported. make[1]: *** [scripts/kconfig/Makefile:81: defconfig] Error 1 make: *** [Makefile:602: defconfig] Error 2 The new script takes care of ICC because we have although I am not sure if building the kernel with ICC is well-supported. [1]: https://lore.kernel.org/r/20210110190807.134996-1-paul.gortmaker@windriver.com [2]: https://lore.kernel.org/r/CAHk-=wh-+TMHPTFo1qs-MYyK7tZh-OQovA=pP3=e06aCVp6_kA@mail.gmail.com Fixes: 87de84c9140e ("kbuild: remove cc-option test of -Werror=date-time") Reported-by: Paul Gortmaker Suggested-by: Linus Torvalds Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Miguel Ojeda Tested-by: Miguel Ojeda Tested-by: Sedat Dilek Signed-off-by: Masahiro Yamada --- MAINTAINERS | 1 - include/linux/compiler-clang.h | 10 ----- include/linux/compiler-gcc.h | 11 ----- init/Kconfig | 9 ++-- scripts/Kconfig.include | 6 +++ scripts/cc-version.sh | 82 ++++++++++++++++++++++++++++++++++ scripts/clang-version.sh | 19 -------- scripts/gcc-version.sh | 20 --------- 8 files changed, 93 insertions(+), 65 deletions(-) create mode 100755 scripts/cc-version.sh delete mode 100755 scripts/clang-version.sh delete mode 100755 scripts/gcc-version.sh diff --git a/MAINTAINERS b/MAINTAINERS index 667d03852191..df820969be1f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4314,7 +4314,6 @@ C: irc://chat.freenode.net/clangbuiltlinux F: Documentation/kbuild/llvm.rst F: include/linux/compiler-clang.h F: scripts/clang-tools/ -F: scripts/clang-version.sh F: scripts/lld-version.sh K: \b(?i:clang|llvm)\b diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 98cff1b4b088..04c0a5a717f7 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -3,16 +3,6 @@ #error "Please don't include directly, include instead." #endif -#define CLANG_VERSION (__clang_major__ * 10000 \ - + __clang_minor__ * 100 \ - + __clang_patchlevel__) - -#if CLANG_VERSION < 100001 -#ifndef __BPF_TRACING__ -# error Sorry, your version of Clang is too old - please use 10.0.1 or newer. -#endif -#endif - /* Compiler specific definitions for Clang compiler */ /* same as gcc, this was present in clang-2.6 so we can assume it works diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 555ab0fddbef..48750243db4c 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -10,17 +10,6 @@ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) -/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ -#if GCC_VERSION < 40900 -# error Sorry, your version of GCC is too old - please use 4.9 or newer. -#elif defined(CONFIG_ARM64) && GCC_VERSION < 50100 -/* - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293 - * https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk - */ -# error Sorry, your version of GCC is too old - please use 5.1 or newer. -#endif - /* * This macro obfuscates arithmetic on a variable address so that gcc * shouldn't recognize the original var, and make assumptions about it. diff --git a/init/Kconfig b/init/Kconfig index 29ad68325028..7bcfa24524c2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -26,11 +26,11 @@ config CC_VERSION_TEXT and then every file will be rebuilt. config CC_IS_GCC - def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q gcc) + def_bool $(success,test "$(cc-name)" = GCC) config GCC_VERSION int - default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC + default $(cc-version) if CC_IS_GCC default 0 config LD_VERSION @@ -38,14 +38,15 @@ config LD_VERSION default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh) config CC_IS_CLANG - def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q clang) + def_bool $(success,test "$(cc-name)" = Clang) config LD_IS_LLD def_bool $(success,$(LD) -v | head -n 1 | grep -q LLD) config CLANG_VERSION int - default $(shell,$(srctree)/scripts/clang-version.sh $(CC)) + default $(cc-version) if CC_IS_CLANG + default 0 config LLD_VERSION int diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include index a5fe72c504ff..0228cb9c74aa 100644 --- a/scripts/Kconfig.include +++ b/scripts/Kconfig.include @@ -39,6 +39,12 @@ as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler $(error-if,$(failure,command -v $(CC)),compiler '$(CC)' not found) $(error-if,$(failure,command -v $(LD)),linker '$(LD)' not found) +# Get the compiler name, version, and error out if it is not supported. +cc-info := $(shell,$(srctree)/scripts/cc-version.sh $(CC)) +$(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this compiler is not supported.) +cc-name := $(shell,set -- $(cc-info) && echo $1) +cc-version := $(shell,set -- $(cc-info) && echo $2) + # Fail if the linker is gold as it's not capable of linking the kernel proper $(error-if,$(success, $(LD) -v | grep -q gold), gold linker '$(LD)' not supported) diff --git a/scripts/cc-version.sh b/scripts/cc-version.sh new file mode 100755 index 000000000000..3f2ee885b116 --- /dev/null +++ b/scripts/cc-version.sh @@ -0,0 +1,82 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Print the compiler name and its version in a 5 or 6-digit form. +# Also, perform the minimum version check. + +set -e + +# When you raise the minimum compiler version, please update +# Documentation/process/changes.rst as well. +gcc_min_version=4.9.0 +clang_min_version=10.0.1 +icc_min_version=16.0.3 # temporary + +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293 +# https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk +if [ "$SRCARCH" = arm64 ]; then + gcc_min_version=5.1.0 +fi + +# Print the compiler name and some version components. +get_compiler_info() +{ + cat <<- EOF | "$@" -E -P -x c - 2>/dev/null + #if defined(__clang__) + Clang __clang_major__ __clang_minor__ __clang_patchlevel__ + #elif defined(__INTEL_COMPILER) + ICC __INTEL_COMPILER __INTEL_COMPILER_UPDATE + #elif defined(__GNUC__) + GCC __GNUC__ __GNUC_MINOR__ __GNUC_PATCHLEVEL__ + #else + unknown + #endif + EOF +} + +# Convert the version string x.y.z to a canonical 5 or 6-digit form. +get_canonical_version() +{ + IFS=. + set -- $1 + echo $((10000 * $1 + 100 * $2 + $3)) +} + +# $@ instead of $1 because multiple words might be given, e.g. CC="ccache gcc". +orig_args="$@" +set -- $(get_compiler_info "$@") + +name=$1 + +case "$name" in +GCC) + version=$2.$3.$4 + min_version=$gcc_min_version + ;; +Clang) + version=$2.$3.$4 + min_version=$clang_min_version + ;; +ICC) + version=$(($2 / 100)).$(($2 % 100)).$3 + min_version=$icc_min_version + ;; +*) + echo "$orig_args: unknown compiler" >&2 + exit 1 + ;; +esac + +cversion=$(get_canonical_version $version) +min_cversion=$(get_canonical_version $min_version) + +if [ "$cversion" -lt "$min_cversion" ]; then + echo >&2 "***" + echo >&2 "*** Compiler is too old." + echo >&2 "*** Your $name version: $version" + echo >&2 "*** Minimum $name version: $min_version" + echo >&2 "***" + exit 1 +fi + +echo $name $cversion diff --git a/scripts/clang-version.sh b/scripts/clang-version.sh deleted file mode 100755 index 6fabf0695761..000000000000 --- a/scripts/clang-version.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# clang-version clang-command -# -# Print the compiler version of `clang-command' in a 5 or 6-digit form -# such as `50001' for clang-5.0.1 etc. - -compiler="$*" - -if ! ( $compiler --version | grep -q clang) ; then - echo 0 - exit 1 -fi - -MAJOR=$(echo __clang_major__ | $compiler -E -x c - | tail -n 1) -MINOR=$(echo __clang_minor__ | $compiler -E -x c - | tail -n 1) -PATCHLEVEL=$(echo __clang_patchlevel__ | $compiler -E -x c - | tail -n 1) -printf "%d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL diff --git a/scripts/gcc-version.sh b/scripts/gcc-version.sh deleted file mode 100755 index ae353432539b..000000000000 --- a/scripts/gcc-version.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# gcc-version gcc-command -# -# Print the gcc version of `gcc-command' in a 5 or 6-digit form -# such as `29503' for gcc-2.95.3, `30301' for gcc-3.3.1, etc. - -compiler="$*" - -if [ ${#compiler} -eq 0 ]; then - echo "Error: No compiler specified." >&2 - printf "Usage:\n\t$0 \n" >&2 - exit 1 -fi - -MAJOR=$(echo __GNUC__ | $compiler -E -x c - | tail -n 1) -MINOR=$(echo __GNUC_MINOR__ | $compiler -E -x c - | tail -n 1) -PATCHLEVEL=$(echo __GNUC_PATCHLEVEL__ | $compiler -E -x c - | tail -n 1) -printf "%d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL From ab37d5a43162ab424e36be03684881df438378a7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 16 Jan 2021 08:43:02 +0900 Subject: [PATCH 131/506] genksyms: make source_file a local variable in lexer This is only used in yylex() in lex.l Signed-off-by: Masahiro Yamada --- scripts/genksyms/genksyms.c | 2 +- scripts/genksyms/genksyms.h | 2 +- scripts/genksyms/lex.l | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/genksyms/genksyms.c b/scripts/genksyms/genksyms.c index 23eff234184f..4827c5abe5b7 100644 --- a/scripts/genksyms/genksyms.c +++ b/scripts/genksyms/genksyms.c @@ -29,7 +29,7 @@ static struct symbol *symtab[HASH_BUCKETS]; static FILE *debugfile; int cur_line = 1; -char *cur_filename, *source_file; +char *cur_filename; int in_source_file; static int flag_debug, flag_dump_defs, flag_reference, flag_dump_types, diff --git a/scripts/genksyms/genksyms.h b/scripts/genksyms/genksyms.h index 2bcdb9bebab4..21ed2ec2d98c 100644 --- a/scripts/genksyms/genksyms.h +++ b/scripts/genksyms/genksyms.h @@ -47,7 +47,7 @@ typedef struct string_list **yystype; #define YYSTYPE yystype extern int cur_line; -extern char *cur_filename, *source_file; +extern char *cur_filename; extern int in_source_file; struct symbol *find_symbol(const char *name, enum symbol_type ns, int exact); diff --git a/scripts/genksyms/lex.l b/scripts/genksyms/lex.l index ae76472efc43..9e88c100fc28 100644 --- a/scripts/genksyms/lex.l +++ b/scripts/genksyms/lex.l @@ -125,6 +125,7 @@ yylex(void) static int suppress_type_lookup, dont_want_brace_phrase; static struct string_list *next_node; + static char *source_file; int token, count = 0; struct string_list *cur_node; From e66e13a3c97486416f65343cd66760645b1d27c7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 16 Jan 2021 08:43:03 +0900 Subject: [PATCH 132/506] genksyms: remove dead code for ST_TABLE_* No one sets lexstate to ST_TABLE_*. It is is very old code, and I do not know what was the plan at that time. Let's remove the dead code. Signed-off-by: Masahiro Yamada --- scripts/genksyms/lex.l | 54 ------------------------------------------ 1 file changed, 54 deletions(-) diff --git a/scripts/genksyms/lex.l b/scripts/genksyms/lex.l index 9e88c100fc28..9cb075cf6a34 100644 --- a/scripts/genksyms/lex.l +++ b/scripts/genksyms/lex.l @@ -119,8 +119,6 @@ yylex(void) static enum { ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1, ST_BRACKET, ST_BRACE, ST_EXPRESSION, ST_STATIC_ASSERT, - ST_TABLE_1, ST_TABLE_2, ST_TABLE_3, ST_TABLE_4, - ST_TABLE_5, ST_TABLE_6 } lexstate = ST_NOTSTARTED; static int suppress_type_lookup, dont_want_brace_phrase; @@ -427,58 +425,6 @@ repeat: } break; - case ST_TABLE_1: - goto repeat; - - case ST_TABLE_2: - if (token == IDENT && yyleng == 1 && yytext[0] == 'X') - { - token = EXPORT_SYMBOL_KEYW; - lexstate = ST_TABLE_5; - APP; - break; - } - lexstate = ST_TABLE_6; - /* FALLTHRU */ - - case ST_TABLE_6: - switch (token) - { - case '{': case '[': case '(': - ++count; - break; - case '}': case ']': case ')': - --count; - break; - case ',': - if (count == 0) - lexstate = ST_TABLE_2; - break; - }; - goto repeat; - - case ST_TABLE_3: - goto repeat; - - case ST_TABLE_4: - if (token == ';') - lexstate = ST_NORMAL; - goto repeat; - - case ST_TABLE_5: - switch (token) - { - case ',': - token = ';'; - lexstate = ST_TABLE_2; - APP; - break; - default: - APP; - break; - } - break; - default: exit(1); } From 13940738c2647bac783439a800fd25ead362a110 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 16 Jan 2021 08:43:04 +0900 Subject: [PATCH 133/506] genksyms: remove useless case DOTS This switch statement does not list out all the cases. Since the 'default' covers all the rest, the 'DOTS' case is unneeded. Signed-off-by: Masahiro Yamada --- scripts/genksyms/lex.l | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/genksyms/lex.l b/scripts/genksyms/lex.l index 9cb075cf6a34..a4d7495eaf75 100644 --- a/scripts/genksyms/lex.l +++ b/scripts/genksyms/lex.l @@ -234,7 +234,6 @@ repeat: lexstate = ST_EXPRESSION; break; - case DOTS: default: APP; break; From 3d277907c2ff36b2057c836023ee46f4f79e691c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 17 Jan 2021 20:51:56 +0900 Subject: [PATCH 134/506] kbuild: doc: remove "Objects which export symbols" section EXPORT_SYMBOL is unrelated to makefiles. No need to mention it. Signed-off-by: Masahiro Yamada --- Documentation/kbuild/makefiles.rst | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 300d8edcb994..aee9d9019238 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -12,7 +12,7 @@ This document describes the Linux kernel Makefiles. --- 3.1 Goal definitions --- 3.2 Built-in object goals - obj-y --- 3.3 Loadable module goals - obj-m - --- 3.4 Objects which export symbols + --- 3.4 --- 3.5 Library file goals - lib-y --- 3.6 Descending down in directories --- 3.7 Non-builtin vmlinux targets - extra-y @@ -247,12 +247,6 @@ more details, with real examples. kbuild will build an ext2.o file for you out of the individual parts and then link this into built-in.a, as you would expect. -3.4 Objects which export symbols --------------------------------- - - No special notation is required in the makefiles for - modules exporting symbols. - 3.5 Library file goals - lib-y ------------------------------ From 0dd77e957a005fa41bf36cdbb0ce841ef7edcdb4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Jan 2021 13:04:03 +0900 Subject: [PATCH 135/506] kbuild: stop removing stale file Revert commit 223c24a7dba9 ("kbuild: Automatically remove stale file"). It was more than 6 years ago. I do not expect anybody to start git-bisect for such a big window. Signed-off-by: Masahiro Yamada --- Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile b/Makefile index f0cfcc7e0886..c1cac349ba4e 100644 --- a/Makefile +++ b/Makefile @@ -257,7 +257,6 @@ export building_out_of_srctree srctree objtree VPATH # of make so .config is not included in this case either (for *config). version_h := include/generated/uapi/linux/version.h -old_version_h := include/linux/version.h clean-targets := %clean mrproper cleandocs no-dot-config-targets := $(clean-targets) \ @@ -1253,7 +1252,6 @@ endef $(version_h): FORCE $(call filechk,version.h) - $(Q)rm -f $(old_version_h) include/generated/utsrelease.h: include/config/kernel.release FORCE $(call filechk,utsrelease.h) From 2047ace96679a146c8573520a080f9dfa06a2c98 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Jan 2021 15:23:51 +0900 Subject: [PATCH 136/506] kbuild: use always-y instead of extra-y As commit d0e628cd817f ("kbuild: doc: clarify the difference between extra-y and always-y") explained, extra-y should be used for listing the prerequisites of vmlinux. These targets are not related to vmlinux. always-y is a better fix. Signed-off-by: Masahiro Yamada Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/Makefile | 8 ++++---- drivers/gpu/drm/i915/Makefile | 2 +- scripts/Makefile.lib | 10 +++++----- scripts/gdb/linux/Makefile | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/Makefile b/Documentation/devicetree/bindings/Makefile index 8f2b054bec5a..90fcad98984d 100644 --- a/Documentation/devicetree/bindings/Makefile +++ b/Documentation/devicetree/bindings/Makefile @@ -78,10 +78,10 @@ $(obj)/processed-schema.json: $(DT_SCHEMA_FILES) check_dtschema_version FORCE endif -extra-$(CHECK_DT_BINDING) += processed-schema-examples.json -extra-$(CHECK_DTBS) += processed-schema.json -extra-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES)) -extra-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES)) +always-$(CHECK_DT_BINDING) += processed-schema-examples.json +always-$(CHECK_DTBS) += processed-schema.json +always-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES)) +always-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES)) # Hack: avoid 'Argument list too long' error for 'make clean'. Remove most of # build artifacts here before they are processed by scripts/Makefile.clean diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 6d9e81ea67f4..938221894d0c 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -294,7 +294,7 @@ no-header-test := \ gvt/mpt.h \ gvt/scheduler.h -extra-$(CONFIG_DRM_I915_WERROR) += \ +always-$(CONFIG_DRM_I915_WERROR) += \ $(patsubst %.h,%.hdrtest, $(filter-out $(no-header-test), \ $(shell cd $(srctree)/$(src) && find * -name '*.h'))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 213677a5ed33..e10b675165cd 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -81,12 +81,12 @@ always-y += $(userprogs-always-y) $(userprogs-always-m) # DTB # If CONFIG_OF_ALL_DTBS is enabled, all DT blobs are built -extra-y += $(dtb-y) -extra-$(CONFIG_OF_ALL_DTBS) += $(dtb-) +always-y += $(dtb-y) +always-$(CONFIG_OF_ALL_DTBS) += $(dtb-) ifneq ($(CHECK_DTBS),) -extra-y += $(patsubst %.dtb,%.dt.yaml, $(dtb-y)) -extra-$(CONFIG_OF_ALL_DTBS) += $(patsubst %.dtb,%.dt.yaml, $(dtb-)) +always-y += $(patsubst %.dtb,%.dt.yaml, $(dtb-y)) +always-$(CONFIG_OF_ALL_DTBS) += $(patsubst %.dtb,%.dt.yaml, $(dtb-)) endif # Add subdir path @@ -247,7 +247,7 @@ $(obj)/%: $(src)/%_shipped # target: source(s) FORCE # $(if_changed,ld/objcopy/gzip) # -# and add target to extra-y so that we know we have to +# and add target to 'targets' so that we know we have to # read in the saved command line # Linking diff --git a/scripts/gdb/linux/Makefile b/scripts/gdb/linux/Makefile index 124755087510..48941faa6ea6 100644 --- a/scripts/gdb/linux/Makefile +++ b/scripts/gdb/linux/Makefile @@ -7,7 +7,7 @@ symlinks := $(patsubst $(srctree)/$(src)/%,%,$(wildcard $(srctree)/$(src)/*.py)) quiet_cmd_symlink = SYMLINK $@ cmd_symlink = ln -fsn $(patsubst $(obj)/%,$(abspath $(srctree))/$(src)/%,$@) $@ -extra-y += $(symlinks) +always-y += $(symlinks) $(addprefix $(obj)/, $(symlinks)): FORCE $(call if_changed,symlink) @@ -18,7 +18,7 @@ quiet_cmd_gen_constants_py = GEN $@ $(CPP) -E -x c -P $(c_flags) $< > $@ ;\ sed -i '1,//d;' $@ -extra-y += constants.py +always-y += constants.py $(obj)/constants.py: $(src)/constants.py.in FORCE $(call if_changed_dep,gen_constants_py) From 1c3fae740aabaeb4d6b4174fc189592eba1b77d0 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Fri, 22 Jan 2021 11:27:17 -0800 Subject: [PATCH 137/506] Kbuild: Make composite object searching more generic Reduce repeated logic around expanding composite objects. Signed-off-by: Elliot Berman Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index e10b675165cd..6f248ff91982 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -56,15 +56,19 @@ else obj-y := $(filter-out %/, $(obj-y)) endif +# Expand $(foo-objs) $(foo-y) by calling $(call suffix-search,foo.o,-objs -y) +suffix-search = $(foreach s,$(2),$($(1:.o=$s))) # If $(foo-objs), $(foo-y), $(foo-m), or $(foo-) exists, foo.o is a composite object -multi-used-y := $(sort $(foreach m,$(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-))), $(m)))) -multi-used-m := $(sort $(foreach m,$(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)) $($(m:.o=-))), $(m)))) +multi-search = $(sort $(foreach m,$(1), $(if $(strip $(call suffix-search,$(m),$(2) -)), $(m)))) +multi-used-y := $(call multi-search,$(obj-y),-objs -y) +multi-used-m := $(call multi-search,$(obj-m),-objs -y -m) multi-used := $(multi-used-y) $(multi-used-m) # Replace multi-part objects by their individual parts, # including built-in.a from subdirectories -real-obj-y := $(foreach m, $(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-))),$($(m:.o=-objs)) $($(m:.o=-y)),$(m))) -real-obj-m := $(foreach m, $(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)) $($(m:.o=-))),$($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)),$(m))) +real-search = $(foreach m,$(1), $(if $(strip $(call suffix-search,$(m),$(2) -)),$(call suffix-search,$(m),$(2)),$(m))) +real-obj-y := $(call real-search, $(obj-y),-objs -y) +real-obj-m := $(call real-search, $(obj-m),-objs -y -m) always-y += $(always-m) From 3c4fa46b30c551b1df2fb1574a684f68bc22067c Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Feb 2021 12:22:18 -0800 Subject: [PATCH 138/506] vmlinux.lds.h: add DWARF v5 sections We expect toolchains to produce these new debug info sections as part of DWARF v5. Add explicit placements to prevent the linker warnings from --orphan-section=warn. Compilers may produce such sections with explicit -gdwarf-5, or based on the implicit default version of DWARF when -g is used via DEBUG_INFO. This implicit default changes over time, and has changed to DWARF v5 with GCC 11. .debug_sup was mentioned in review, but without compilers producing it today, let's wait to add it until it becomes necessary. Cc: stable@vger.kernel.org Link: https://bugzilla.redhat.com/show_bug.cgi?id=1922707 Reported-by: Chris Murphy Suggested-by: Fangrui Song Reviewed-by: Nathan Chancellor Reviewed-by: Mark Wielaard Tested-by: Sedat Dilek Signed-off-by: Nick Desaulniers Signed-off-by: Masahiro Yamada --- include/asm-generic/vmlinux.lds.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b2b3d81b1535..b3d4ceaf19ed 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -828,8 +828,13 @@ /* DWARF 4 */ \ .debug_types 0 : { *(.debug_types) } \ /* DWARF 5 */ \ + .debug_addr 0 : { *(.debug_addr) } \ + .debug_line_str 0 : { *(.debug_line_str) } \ + .debug_loclists 0 : { *(.debug_loclists) } \ .debug_macro 0 : { *(.debug_macro) } \ - .debug_addr 0 : { *(.debug_addr) } + .debug_names 0 : { *(.debug_names) } \ + .debug_rnglists 0 : { *(.debug_rnglists) } \ + .debug_str_offsets 0 : { *(.debug_str_offsets) } /* Stabs debugging sections. */ #define STABS_DEBUG \ From a66049e2cf0ef166dba5bafdbb3062287fc965ad Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Feb 2021 12:22:19 -0800 Subject: [PATCH 139/506] Kbuild: make DWARF version a choice Adds a default CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT which allows the implicit default version of DWARF emitted by the toolchain to progress over time. Modifies CONFIG_DEBUG_INFO_DWARF4 to be a member of a choice, making it mutually exclusive with CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT. Users may want to select this if they are using a newer toolchain, but have consumers of the DWARF debug info that aren't yet ready for newer DWARF versions' debug info. Does so in a way that's forward compatible with existing configs, and makes adding future versions more straightforward. This patch does not change the current behavior or selection of DWARF version for users upgrading to kernels with this patch. GCC since ~4.8 has defaulted to DWARF v4 implicitly, and GCC 11 has bumped this to v5. Remove the Kconfig help text about DWARF v4 being larger. It's empirically false for the latest toolchains for x86_64 defconfig, has no point of reference (I suspect it was DWARF v2 but that's stil empirically false), and debug info size is not a qualatative measure. Suggested-by: Arvind Sankar Suggested-by: Fangrui Song Suggested-by: Jakub Jelinek Suggested-by: Mark Wielaard Suggested-by: Masahiro Yamada Suggested-by: Nathan Chancellor Tested-by: Sedat Dilek Signed-off-by: Nick Desaulniers Signed-off-by: Masahiro Yamada --- Makefile | 5 +++-- lib/Kconfig.debug | 32 ++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index c1cac349ba4e..c567c4343880 100644 --- a/Makefile +++ b/Makefile @@ -830,8 +830,9 @@ ifneq ($(LLVM_IAS),1) KBUILD_AFLAGS += -Wa,-gdwarf-2 endif -ifdef CONFIG_DEBUG_INFO_DWARF4 -DEBUG_CFLAGS += -gdwarf-4 +ifndef CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT +dwarf-version-$(CONFIG_DEBUG_INFO_DWARF4) := 4 +DEBUG_CFLAGS += -gdwarf-$(dwarf-version-y) endif ifdef CONFIG_DEBUG_INFO_REDUCED diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9cf4d12b81fb..3555edcfd4ab 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -256,13 +256,33 @@ config DEBUG_INFO_SPLIT to know about the .dwo files and include them. Incompatible with older versions of ccache. -config DEBUG_INFO_DWARF4 - bool "Generate dwarf4 debuginfo" +choice + prompt "DWARF version" help - Generate dwarf4 debug info. This requires recent versions - of gcc and gdb. It makes the debug information larger. - But it significantly improves the success of resolving - variables in gdb on optimized code. + Which version of DWARF debug info to emit. + +config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT + bool "Rely on the toolchain's implicit default DWARF version" + help + The implicit default version of DWARF debug info produced by a + toolchain changes over time. + + This can break consumers of the debug info that haven't upgraded to + support newer revisions, and prevent testing newer versions, but + those should be less common scenarios. + + If unsure, say Y. + +config DEBUG_INFO_DWARF4 + bool "Generate DWARF Version 4 debuginfo" + help + Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. + + If you have consumers of DWARF debug info that are not ready for + newer revisions of DWARF, you may wish to choose this or have your + config select this. + +endchoice # "DWARF version" config DEBUG_INFO_BTF bool "Generate BTF typeinfo" From 98cd6f521f1016171e9e263effc7d6edfbf61da1 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Feb 2021 12:22:20 -0800 Subject: [PATCH 140/506] Kconfig: allow explicit opt in to DWARF v5 DWARF v5 is the latest standard of the DWARF debug info format. GCC 11 will change the implicit default DWARF version, if left unspecified, to DWARF v5. Allow users of Clang and older versions of GCC that have not changed the implicit default DWARF version to DWARF v5 to opt in. This can help testing consumers of DWARF debug info in preparation of v5 becoming more widespread, as well as result in significant binary size savings of the pre-stripped vmlinux image. DWARF5 wins significantly in terms of size when mixed with compression (CONFIG_DEBUG_INFO_COMPRESSED). 363M vmlinux.clang12.dwarf5.compressed 434M vmlinux.clang12.dwarf4.compressed 439M vmlinux.clang12.dwarf2.compressed 457M vmlinux.clang12.dwarf5 536M vmlinux.clang12.dwarf4 548M vmlinux.clang12.dwarf2 515M vmlinux.gcc10.2.dwarf5.compressed 599M vmlinux.gcc10.2.dwarf4.compressed 624M vmlinux.gcc10.2.dwarf2.compressed 630M vmlinux.gcc10.2.dwarf5 765M vmlinux.gcc10.2.dwarf4 809M vmlinux.gcc10.2.dwarf2 Though the quality of debug info is harder to quantify; size is not a proxy for quality. Jakub notes: One thing is GCC DWARF-5 support, that is whether the compiler will support -gdwarf-5 flag, and that support should be there from GCC 7 onwards. All [GCC] 5.1 - 6.x did was start accepting -gdwarf-5 as experimental option that enabled some small DWARF subset (initially only a few DW_LANG_* codes newly added to DWARF5 drafts). Only GCC 7 (released after DWARF 5 has been finalized) started emitting DWARF5 section headers and got most of the DWARF5 changes in... Another separate thing is whether the assembler does support the -gdwarf-5 option (i.e. if you can compile assembler files with -Wa,-gdwarf-5) ... That option is about whether the assembler will emit DWARF5 or DWARF2 .debug_line. It is fine to compile C sources with -gdwarf-5 and use DWARF2 .debug_line for assembler files if as doesn't support it. Version check GCC so that we don't need to worry about the difference in command line args between GNU readelf and llvm-readelf/llvm-dwarfdump to validate the DWARF Version in the assembler feature detection script. Most issues with clang produced assembler were fixed in binutils 2.35.1, but 2.35.2 fixed issues related to requiring the flag -Wa,-gdwarf-5 explicitly. The added shell script test checks for the latter, and is only required when using clang without its integrated assembler, though we use for clang regardless as we do not yet have a way to query the assembler from Kconfig. Disabled for now if CONFIG_DEBUG_INFO_BTF is set; pahole doesn't yet recognize the new additions to the DWARF debug info. This only modifies the DWARF version emitted by the compiler, not the assembler. The DWARF version of a binary can be validated with: $ llvm-dwarfdump | head -n 4 | grep version or $ readelf --debug-dump=info 2>/dev/null | grep Version Parts of the tree don't reuse DEBUG_CFLAGS as they should; such cleanup is left as a follow up. Link: http://www.dwarfstd.org/doc/DWARF5.pdf Link: https://bugzilla.redhat.com/show_bug.cgi?id=1922707 Reported-by: Sedat Dilek Suggested-by: Arvind Sankar Suggested-by: Caroline Tice Suggested-by: Fangrui Song Suggested-by: Jakub Jelinek Suggested-by: Masahiro Yamada Suggested-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Tested-by: Sedat Dilek # LLVM/Clang v12.0.0-rc1 x86-64 Signed-off-by: Masahiro Yamada --- Makefile | 1 + lib/Kconfig.debug | 18 ++++++++++++++++++ scripts/test_dwarf5_support.sh | 8 ++++++++ 3 files changed, 27 insertions(+) create mode 100755 scripts/test_dwarf5_support.sh diff --git a/Makefile b/Makefile index c567c4343880..681bdb5d2f41 100644 --- a/Makefile +++ b/Makefile @@ -832,6 +832,7 @@ endif ifndef CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT dwarf-version-$(CONFIG_DEBUG_INFO_DWARF4) := 4 +dwarf-version-$(CONFIG_DEBUG_INFO_DWARF5) := 5 DEBUG_CFLAGS += -gdwarf-$(dwarf-version-y) endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3555edcfd4ab..ba8596464596 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -282,6 +282,24 @@ config DEBUG_INFO_DWARF4 newer revisions of DWARF, you may wish to choose this or have your config select this. +config DEBUG_INFO_DWARF5 + bool "Generate DWARF Version 5 debuginfo" + depends on GCC_VERSION >= 50000 || CC_IS_CLANG + depends on CC_IS_GCC || $(success,$(srctree)/scripts/test_dwarf5_support.sh $(CC) $(CLANG_FLAGS)) + depends on !DEBUG_INFO_BTF + help + Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc + 5.0+ accepts the -gdwarf-5 flag but only had partial support for some + draft features until 7.0), and gdb 8.0+. + + Changes to the structure of debug info in Version 5 allow for around + 15-18% savings in resulting image and debug info section sizes as + compared to DWARF Version 4. DWARF Version 5 standardizes previous + extensions such as accelerators for symbol indexing and the format + for fission (.dwo/.dwp) files. Users may not want to select this + config if they rely on tooling that has not yet been updated to + support DWARF Version 5. + endchoice # "DWARF version" config DEBUG_INFO_BTF diff --git a/scripts/test_dwarf5_support.sh b/scripts/test_dwarf5_support.sh new file mode 100755 index 000000000000..c46e2456b47a --- /dev/null +++ b/scripts/test_dwarf5_support.sh @@ -0,0 +1,8 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Test that the assembler doesn't need -Wa,-gdwarf-5 when presented with DWARF +# v5 input, such as `.file 0` and `md5 0x00`. Should be fixed in GNU binutils +# 2.35.2. https://sourceware.org/bugzilla/show_bug.cgi?id=25611 +echo '.file 0 "filename" md5 0x7a0b65214090b6693bd1dc24dd248245' | \ + $* -gdwarf-5 -Wno-unused-command-line-argument -c -x assembler -o /dev/null - From 9b82f13e7ef316cdc0a8858f1349f4defce3f9e0 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 5 Feb 2021 22:50:32 -0500 Subject: [PATCH 141/506] kbuild: clamp SUBLEVEL to 255 Right now if SUBLEVEL becomes larger than 255 it will overflow into the territory of PATCHLEVEL, causing havoc in userspace that tests for specific kernel version. While userspace code tests for MAJOR and PATCHLEVEL, it doesn't test SUBLEVEL at any point as ABI changes don't happen in the context of stable tree. Thus, to avoid overflows, simply clamp SUBLEVEL to it's maximum value in the context of LINUX_VERSION_CODE. This does not affect "make kernelversion" and such. Signed-off-by: Sasha Levin Signed-off-by: Masahiro Yamada --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 681bdb5d2f41..12607d389148 100644 --- a/Makefile +++ b/Makefile @@ -1247,9 +1247,15 @@ define filechk_utsrelease.h endef define filechk_version.h - echo \#define LINUX_VERSION_CODE $(shell \ - expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + 0$(SUBLEVEL)); \ - echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))' + if [ $(SUBLEVEL) -gt 255 ]; then \ + echo \#define LINUX_VERSION_CODE $(shell \ + expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + 255); \ + else \ + echo \#define LINUX_VERSION_CODE $(shell \ + expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \ + fi; \ + echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + \ + ((c) > 255 ? 255 : (c)))' endef $(version_h): FORCE From 88a686728b3739d3598851e729c0e81f194e5c53 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 12 Feb 2021 11:29:24 -0500 Subject: [PATCH 142/506] kbuild: simplify access to the kernel's version Instead of storing the version in a single integer and having various kernel (and userspace) code how it's constructed, export individual (major, patchlevel, sublevel) components and simplify kernel code that uses it. This should also make it easier on userspace. Signed-off-by: Sasha Levin Acked-by: Greg Kroah-Hartman Signed-off-by: Masahiro Yamada --- Makefile | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++-- drivers/usb/core/hcd.c | 4 ++-- drivers/usb/gadget/udc/aspeed-vhub/hub.c | 4 ++-- include/linux/usb/composite.h | 4 ++-- kernel/sys.c | 2 +- 6 files changed, 13 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 12607d389148..1fdd44fe1659 100644 --- a/Makefile +++ b/Makefile @@ -1255,7 +1255,10 @@ define filechk_version.h expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \ fi; \ echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + \ - ((c) > 255 ? 255 : (c)))' + ((c) > 255 ? 255 : (c)))'; \ + echo \#define LINUX_VERSION_MAJOR $(VERSION); \ + echo \#define LINUX_VERSION_PATCHLEVEL $(PATCHLEVEL); \ + echo \#define LINUX_VERSION_SUBLEVEL $(SUBLEVEL) endef $(version_h): FORCE diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ca6f2fc39ea0..29f886263dc5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -235,8 +235,8 @@ static void mlx5_set_driver_version(struct mlx5_core_dev *dev) remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); snprintf(string + strlen(string), remaining_size, "%u.%u.%u", - (u8)((LINUX_VERSION_CODE >> 16) & 0xff), (u8)((LINUX_VERSION_CODE >> 8) & 0xff), - (u16)(LINUX_VERSION_CODE & 0xffff)); + LINUX_VERSION_MAJOR, LINUX_VERSION_PATCHLEVEL, + LINUX_VERSION_SUBLEVEL); /*Send the command*/ MLX5_SET(set_driver_version_in, in, opcode, diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index ad5a0f405a75..3f0381344221 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -111,8 +111,8 @@ DECLARE_WAIT_QUEUE_HEAD(usb_kill_urb_queue); */ /*-------------------------------------------------------------------------*/ -#define KERNEL_REL bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff)) -#define KERNEL_VER bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff)) +#define KERNEL_REL bin2bcd(LINUX_VERSION_MAJOR) +#define KERNEL_VER bin2bcd(LINUX_VERSION_PATCHLEVEL) /* usb 3.1 root hub device descriptor */ static const u8 usb31_rh_dev_descriptor[18] = { diff --git a/drivers/usb/gadget/udc/aspeed-vhub/hub.c b/drivers/usb/gadget/udc/aspeed-vhub/hub.c index bfd8e77788e2..5c7dea5e0ff1 100644 --- a/drivers/usb/gadget/udc/aspeed-vhub/hub.c +++ b/drivers/usb/gadget/udc/aspeed-vhub/hub.c @@ -46,8 +46,8 @@ * - Make vid/did overridable * - make it look like usb1 if usb1 mode forced */ -#define KERNEL_REL bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff)) -#define KERNEL_VER bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff)) +#define KERNEL_REL bin2bcd(LINUX_VERSION_MAJOR) +#define KERNEL_VER bin2bcd(LINUX_VERSION_PATCHLEVEL) enum { AST_VHUB_STR_INDEX_MAX = 4, diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index a2d229ab63ba..7531ce723374 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -573,8 +573,8 @@ static inline u16 get_default_bcdDevice(void) { u16 bcdDevice; - bcdDevice = bin2bcd((LINUX_VERSION_CODE >> 16 & 0xff)) << 8; - bcdDevice |= bin2bcd((LINUX_VERSION_CODE >> 8 & 0xff)); + bcdDevice = bin2bcd(LINUX_VERSION_MAJOR) << 8; + bcdDevice |= bin2bcd(LINUX_VERSION_PATCHLEVEL); return bcdDevice; } diff --git a/kernel/sys.c b/kernel/sys.c index 51f00fe20e4d..c2225bd405d5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1243,7 +1243,7 @@ static int override_release(char __user *release, size_t len) break; rest++; } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; + v = LINUX_VERSION_PATCHLEVEL + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); From 585d32f9b0532ca2407943edec163c23191de488 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 12 Feb 2021 01:11:54 +0900 Subject: [PATCH 143/506] gen_compile_commands: prune some directories If directories are passed to gen_compile_commands.py, os.walk() traverses all the subdirectories to search for .cmd files, but we know some of them are not worth traversing. Use the 'topdown' parameter of os.walk to prune them. Documentation about the 'topdown' option of os.walk: When topdown is True, the caller can modify the dirnames list in-place (perhaps using del or slice assignment), and walk() will only recurse into the subdirectories whose names remain in dirnames; this can be used to prune the search, impose a specific order of visiting, or even to inform walk() about directories the caller creates or renames before it resumes walk() again. Modifying dirnames when topdown is False has no effect on the behavior of the walk, because in bottom-up mode the directories in dirnames are generated before dirpath itself is generated. This commit prunes four directories, .git, Documentation, include, and tools. The first three do not contain any C files, so skipping them makes this script work slightly faster. My main motivation is the last one, tools/ directory. Commit 6ca4c6d25949 ("gen_compile_commands: do not support .cmd files under tools/ directory") stopped supporting the tools/ directory. The current code no longer picks up .cmd files from the tools/ directory. If you run: ./scripts/clang-tools/gen_compile_commands.py --log_level=INFO then, you will see several "File ... not found" log messages. This is expected, and I do not want to support the tools/ directory. However, without an explicit comment "do not support tools/", somebody might try to get it back. Clarify this. Signed-off-by: Masahiro Yamada Acked-by: Nathan Chancellor --- scripts/clang-tools/gen_compile_commands.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/clang-tools/gen_compile_commands.py b/scripts/clang-tools/gen_compile_commands.py index 8ddb5d099029..b7e9ecf16e56 100755 --- a/scripts/clang-tools/gen_compile_commands.py +++ b/scripts/clang-tools/gen_compile_commands.py @@ -20,7 +20,9 @@ _DEFAULT_LOG_LEVEL = 'WARNING' _FILENAME_PATTERN = r'^\..*\.cmd$' _LINE_PATTERN = r'^cmd_[^ ]*\.o := (.* )([^ ]*\.c)$' _VALID_LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] - +# The tools/ directory adopts a different build system, and produces .cmd +# files in a different format. Do not support it. +_EXCLUDE_DIRS = ['.git', 'Documentation', 'include', 'tools'] def parse_arguments(): """Sets up and parses command-line arguments. @@ -80,8 +82,14 @@ def cmdfiles_in_dir(directory): """ filename_matcher = re.compile(_FILENAME_PATTERN) + exclude_dirs = [ os.path.join(directory, d) for d in _EXCLUDE_DIRS ] + + for dirpath, dirnames, filenames in os.walk(directory, topdown=True): + # Prune unwanted directories. + if dirpath in exclude_dirs: + dirnames[:] = [] + continue - for dirpath, _, filenames in os.walk(directory): for filename in filenames: if filename_matcher.match(filename): yield os.path.join(dirpath, filename) From 75cfb200cd081d23eb7eaa68deba9e0ab9320070 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 12 Feb 2021 16:49:47 -0500 Subject: [PATCH 144/506] NFS: 'flags' field should be unsigned in struct nfs_server The mount flags are all unsigned integers, so we should not be storing them in a signed field. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 38e60ec742df..962e8313f007 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -142,7 +142,7 @@ struct nfs_server { struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ atomic_long_t writeback; /* number of writeback pages */ - int flags; /* various flags */ + unsigned int flags; /* various flags */ /* The following are for internal use only. Also see uapi/linux/nfs_mount.h */ #define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 From ed7bcdb374d20fab9e9dc36853a6735c047ad1b1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 12 Feb 2021 16:49:48 -0500 Subject: [PATCH 145/506] NFS: Add support for eager writes Support eager writing to the server, meaning that we write the data to cache on the server, and wait for that to complete. This ensures that we see ENOSPC errors immediately. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 19 +++++++++++++++++-- fs/nfs/write.c | 17 ++++++++++++----- include/linux/nfs_fs_sb.h | 2 ++ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 03fd1dcc96bd..16ad5050e046 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - unsigned long written = 0; - ssize_t result; + unsigned int mntflags = NFS_SERVER(inode)->flags; + ssize_t result, written; errseq_t since; int error; @@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = result; iocb->ki_pos += written; + + if (mntflags & NFS_MOUNT_WRITE_EAGER) { + result = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } + if (mntflags & NFS_MOUNT_WRITE_WAIT) { + result = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } result = generic_write_sync(iocb, written); if (result < 0) goto out; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6193350356a8..82bdcb982186 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc; + struct nfs_io_completion *ioc = NULL; + unsigned int mntflags = NFS_SERVER(inode)->flags; + int priority = 0; int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - ioc = nfs_io_completion_alloc(GFP_KERNEL); - if (ioc) - nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); + if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || + wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + ioc = nfs_io_completion_alloc(GFP_KERNEL); + if (ioc) + nfs_io_completion_init(ioc, nfs_io_completion_commit, + inode); + priority = wb_priority(wbc); + } - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 962e8313f007..6f76b32a0238 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -153,6 +153,8 @@ struct nfs_server { #define NFS_MOUNT_LOCAL_FCNTL 0x200000 #define NFS_MOUNT_SOFTERR 0x400000 #define NFS_MOUNT_SOFTREVAL 0x800000 +#define NFS_MOUNT_WRITE_EAGER 0x01000000 +#define NFS_MOUNT_WRITE_WAIT 0x02000000 unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ From a0492339fc70f1f7aa98f0cab55b78b0be124711 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 12 Feb 2021 16:49:49 -0500 Subject: [PATCH 146/506] NFS: Add mount options supporting eager writes Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/fs_context.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 06894bcdea2d..b6be02aa79f0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -82,6 +82,7 @@ enum nfs_param { Opt_v, Opt_vers, Opt_wsize, + Opt_write, }; enum { @@ -113,6 +114,19 @@ static const struct constant_table nfs_param_enums_lookupcache[] = { {} }; +enum { + Opt_write_lazy, + Opt_write_eager, + Opt_write_wait, +}; + +static const struct constant_table nfs_param_enums_write[] = { + { "lazy", Opt_write_lazy }, + { "eager", Opt_write_eager }, + { "wait", Opt_write_wait }, + {} +}; + static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag_no("ac", Opt_ac), fsparam_u32 ("acdirmax", Opt_acdirmax), @@ -171,6 +185,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag ("v4.1", Opt_v), fsparam_flag ("v4.2", Opt_v), fsparam_string("vers", Opt_vers), + fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), {} }; @@ -770,6 +785,24 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_invalid_value; } break; + case Opt_write: + switch (result.uint_32) { + case Opt_write_lazy: + ctx->flags &= + ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT); + break; + case Opt_write_eager: + ctx->flags |= NFS_MOUNT_WRITE_EAGER; + ctx->flags &= ~NFS_MOUNT_WRITE_WAIT; + break; + case Opt_write_wait: + ctx->flags |= + NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT; + break; + default: + goto out_invalid_value; + } + break; /* * Special options From 6c17260ca4aeb17d11461647c6b7eefcc2602acc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 12 Feb 2021 16:41:19 -0500 Subject: [PATCH 147/506] NFS: Set the stable writes flag when initialising the super block We need to wait for outstanding writes on the page to complete before we can update it. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/fs_context.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index b6be02aa79f0..971a9251c1d9 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1512,6 +1512,8 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; ctx->minorversion = 0; ctx->need_mount = true; + + fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = ctx; fc->ops = &nfs_fs_context_ops; From 7ae017c7322e2b12472033e65a48aa25cde2fb22 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 17 Feb 2021 10:12:33 -0500 Subject: [PATCH 148/506] NFS: Support the '-owrite=' option in /proc/self/mounts and mountinfo Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4034102010f0..bd22c9338600 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -511,6 +511,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_puts(m, ",local_lock=flock"); else seq_puts(m, ",local_lock=posix"); + + if (nfss->flags & NFS_MOUNT_WRITE_EAGER) { + if (nfss->flags & NFS_MOUNT_WRITE_WAIT) + seq_puts(m, ",write=wait"); + else + seq_puts(m, ",write=eager"); + } } /* From 0a8ed2eaac102c746d8d114f2787f06cb3e55dfb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Feb 2021 16:34:09 -0800 Subject: [PATCH 149/506] KVM: SVM: Intercept INVPCID when it's disabled to inject #UD Intercept INVPCID if it's disabled in the guest, even when using NPT, as KVM needs to inject #UD in this case. Fixes: 4407a797e941 ("KVM: SVM: Enable INVPCID feature on AMD") Cc: Babu Moger Signed-off-by: Sean Christopherson Message-Id: <20210212003411.1102677-2-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index adb3619a3c16..f2f214dbac8a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1103,12 +1103,12 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) static void svm_check_invpcid(struct vcpu_svm *svm) { /* - * Intercept INVPCID instruction only if shadow page table is - * enabled. Interception is not required with nested page table - * enabled. + * Intercept INVPCID if shadow paging is enabled to sync/free shadow + * roots, or if INVPCID is disabled in the guest to inject #UD. */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { - if (!npt_enabled) + if (!npt_enabled || + !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); From e42033342293212ba5329f04f15e81dcb29b7118 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Feb 2021 16:34:10 -0800 Subject: [PATCH 150/506] KVM: x86: Advertise INVPCID by default Advertise INVPCID by default (if supported by the host kernel) instead of having both SVM and VMX opt in. INVPCID was opt in when it was a VMX only feature so that KVM wouldn't prematurely advertise support if/when it showed up in the kernel on AMD hardware. Signed-off-by: Sean Christopherson Message-Id: <20210212003411.1102677-3-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm/svm.c | 3 --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c8f2592ccc99..6bd2f8b830e4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -408,7 +408,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_7_0_EBX, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | + F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f2f214dbac8a..e4cc081ad7c1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -926,9 +926,6 @@ static __init void svm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - - /* Enable INVPCID feature */ - kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e0a3a9be654b..6d265b2523f8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7330,8 +7330,8 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0x7 */ if (kvm_mpx_supported()) kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); - if (cpu_has_vmx_invpcid()) - kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); + if (!cpu_has_vmx_invpcid()) + kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); From 1aaca37e1e4e3d098232ee9e3b154e83c52374ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Feb 2021 16:34:11 -0800 Subject: [PATCH 151/506] KVM: VMX: Allow INVPCID in guest without PCID Remove the restriction that prevents VMX from exposing INVPCID to the guest without PCID also being exposed to the guest. The justification of the restriction is that INVPCID will #UD if it's disabled in the VMCS. While that is a true statement, it's also true that RDTSCP will #UD if it's disabled in the VMCS. Neither of those things has any dependency whatsoever on the guest being able to set CR4.PCIDE=1, which is what is effectively allowed by exposing PCID to the guest. Removing the bogus restriction aligns VMX with SVM, and also allows for an interesting configuration. INVPCID is that fastest way to do a global TLB flush, e.g. see native_flush_tlb_global(). Allowing INVPCID without PCID would let a guest use the expedited flush while also limiting the number of ASIDs consumed by the guest. Signed-off-by: Sean Christopherson Message-Id: <20210212003411.1102677-4-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6d265b2523f8..e1b84008a05d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4295,18 +4295,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); - - /* - * Expose INVPCID if and only if PCID is also exposed to the guest. - * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF - * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect - * behavior from the guest perspective (it would expect #GP or #PF). - */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); - vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); From f5c59b575bdfb6522a99e964875e39bd2568657b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 17 Feb 2021 16:57:12 +0200 Subject: [PATCH 152/506] KVM: VMX: read idt_vectoring_info a bit earlier trace_kvm_exit prints this value (using vmx_get_exit_info) so it makes sense to read it before the trace point. Fixes: dcf068da7eb2 ("KVM: VMX: Introduce generic fastpath handler") Signed-off-by: Maxim Levitsky Message-Id: <20210217145718.1217358-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e1b84008a05d..5ddaf7e4f601 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6841,13 +6841,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); + if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); From 954f419ba8ad6b636ae46b24aaa6a91512df5da8 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 17 Feb 2021 16:57:13 +0200 Subject: [PATCH 153/506] KVM: nSVM: move nested vmrun tracepoint to enter_svm_guest_mode This way trace will capture all the nested mode entries (including entries after migration, and from smm) Signed-off-by: Maxim Levitsky Message-Id: <20210217145718.1217358-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cc91738ab445..bcf466d2d807 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -436,6 +436,20 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, { int ret; + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, + vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl); + + trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, + vmcb12->control.intercepts[INTERCEPT_CR] >> 16, + vmcb12->control.intercepts[INTERCEPT_EXCEPTION], + vmcb12->control.intercepts[INTERCEPT_WORD3], + vmcb12->control.intercepts[INTERCEPT_WORD4], + vmcb12->control.intercepts[INTERCEPT_WORD5]); + + svm->nested.vmcb12_gpa = vmcb12_gpa; load_nested_vmcb_control(svm, &vmcb12->control); nested_prepare_vmcb_save(svm, vmcb12); @@ -489,18 +503,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm) goto out; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, - vmcb12->save.rip, - vmcb12->control.int_ctl, - vmcb12->control.event_inj, - vmcb12->control.nested_ctl); - - trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, - vmcb12->control.intercepts[INTERCEPT_CR] >> 16, - vmcb12->control.intercepts[INTERCEPT_EXCEPTION], - vmcb12->control.intercepts[INTERCEPT_WORD3], - vmcb12->control.intercepts[INTERCEPT_WORD4], - vmcb12->control.intercepts[INTERCEPT_WORD5]); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); From a04aead144fd938c2d9869eb187e5b9ea0009bae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Feb 2021 07:16:59 -0500 Subject: [PATCH 154/506] KVM: nSVM: fix running nested guests when npt=0 In case of npt=0 on host, nSVM needs the same .inject_page_fault tweak as VMX has, to make sure that shadow mmu faults are injected as vmexits. It is not clear why this is needed at all, but for now keep the same code as VMX and we'll fix it for both. Based on a patch by Maxim Levitsky . Fixes: 7c86663b68ba ("KVM: nSVM: inject exceptions via svm_check_nested_events") Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bcf466d2d807..92d3aaaac612 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -51,6 +51,23 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, nested_svm_vmexit(svm); } +static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) +{ + struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(!is_guest_mode(vcpu)); + + if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && + !svm->nested.nested_run_pending) { + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = fault->error_code; + svm->vmcb->control.exit_info_2 = fault->address; + nested_svm_vmexit(svm); + } else { + kvm_inject_page_fault(vcpu, fault); + } +} + static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) { struct vcpu_svm *svm = to_svm(vcpu); @@ -460,6 +477,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, if (ret) return ret; + if (!npt_enabled) + svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + svm_set_gif(svm, true); return 0; From 78e550bad2984a524d8a71ba8feed366b29436ef Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Feb 2021 07:19:55 -0500 Subject: [PATCH 155/506] KVM: nVMX: no need to undo inject_page_fault change on nested vmexit This is not needed because the tweak was done on the guest_mmu, while nested_ept_uninit_mmu_context has just changed vcpu->arch.walk_mmu back to the root_mmu. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b2f0b5e9cd63..066156ff9c7c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4200,9 +4200,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); From b1b805f3cb22efe3ec48703699ca7181b9f3785b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 20 Jan 2021 09:59:22 -0800 Subject: [PATCH 156/506] dt-bindings: dp-connector: Drop maxItems from -supply The meta-schema recently gained a definition for the common -supply$ property, which denotes that maxItems is not a valid property. Drop this to clear up the binding validation error. Fixes: a46c112512de ("dt-bindings: dp-connector: add binding for DisplayPort connector") Signed-off-by: Bjorn Andersson Signed-off-by: Rob Herring (cherry picked from commit ca230ab18791f418faac19d00f3c35e2dbc25dfe) Link: https://patchwork.freedesktop.org/patch/msgid/20210120175922.1579835-1-bjorn.andersson@linaro.org --- .../devicetree/bindings/display/connector/dp-connector.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/connector/dp-connector.yaml b/Documentation/devicetree/bindings/display/connector/dp-connector.yaml index 1c17d60e7760..22792a79e7ce 100644 --- a/Documentation/devicetree/bindings/display/connector/dp-connector.yaml +++ b/Documentation/devicetree/bindings/display/connector/dp-connector.yaml @@ -26,7 +26,6 @@ properties: dp-pwr-supply: description: Power supply for the DP_PWR pin - maxItems: 1 port: $ref: /schemas/graph.yaml#/properties/port From 92c6058024e87087cf1b99b0389d67c0a886360e Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Thu, 10 Sep 2020 07:57:04 +0000 Subject: [PATCH 157/506] i40e: Fix flow for IPv6 next header (extension header) When a packet contains an IPv6 header with next header which is an extension header and not a protocol one, the kernel function skb_transport_header called with such sk_buff will return a pointer to the extension header and not to the TCP one. The above explained call caused a problem with packet processing for skb with encapsulation for tunnel with I40E_TX_CTX_EXT_IP_IPV6. The extension header was not skipped at all. The ipv6_skip_exthdr function does check if next header of the IPV6 header is an extension header and doesn't modify the l4_proto pointer if it points to a protocol header value so its safe to omit the comparison of exthdr and l4.hdr pointers. The ipv6_skip_exthdr can return value -1. This means that the skipping process failed and there is something wrong with the packet so it will be dropped. Fixes: a3fd9d8876a5 ("i40e/i40evf: Handle IPv6 extension headers in checksum offload") Signed-off-by: Slawomir Laba Signed-off-by: Przemyslaw Patynowski Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 4aca637d4a23..32d97315f3f5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3113,13 +3113,16 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags, l4_proto = ip.v4->protocol; } else if (*tx_flags & I40E_TX_FLAGS_IPV6) { + int ret; + tunnel |= I40E_TX_CTX_EXT_IP_IPV6; exthdr = ip.hdr + sizeof(*ip.v6); l4_proto = ip.v6->nexthdr; - if (l4.hdr != exthdr) - ipv6_skip_exthdr(skb, exthdr - skb->data, - &l4_proto, &frag_off); + ret = ipv6_skip_exthdr(skb, exthdr - skb->data, + &l4_proto, &frag_off); + if (ret < 0) + return -1; } /* define outer transport */ From 58cab46c622d6324e47bd1c533693c94498e4172 Mon Sep 17 00:00:00 2001 From: Keita Suzuki Date: Fri, 30 Oct 2020 07:14:30 +0000 Subject: [PATCH 158/506] i40e: Fix memory leak in i40e_probe Struct i40e_veb is allocated in function i40e_setup_pf_switch, and stored to an array field veb inside struct i40e_pf. However when i40e_setup_misc_vector fails, this memory leaks. Fix this by calling exit and teardown functions. Signed-off-by: Keita Suzuki Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1db482d310c2..84916261f5df 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -15122,6 +15122,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) { dev_info(&pdev->dev, "setup of misc vector failed: %d\n", err); + i40e_cloud_filter_exit(pf); + i40e_fdir_teardown(pf); goto err_vsis; } } From d2c788f739b6f68090e968a2ee31b543701e795f Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Fri, 20 Nov 2020 10:35:37 +0000 Subject: [PATCH 159/506] i40e: Add zero-initialization of AQ command structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zero-initialize AQ command data structures to comply with API specifications. Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Fixes: f4492db16df8 ("i40e: Add NPAR BW get and set functions") Signed-off-by: Andrzej Sawuła Signed-off-by: Mateusz Palczewski Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 84916261f5df..90c6c991aebc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7667,6 +7667,8 @@ int i40e_add_del_cloud_filter(struct i40e_vsi *vsi, if (filter->flags >= ARRAY_SIZE(flag_table)) return I40E_ERR_CONFIG; + memset(&cld_filter, 0, sizeof(cld_filter)); + /* copy element needed to add cloud filter from filter */ i40e_set_cld_element(filter, &cld_filter); @@ -7734,6 +7736,8 @@ int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, !ipv6_addr_any(&filter->ip.v6.src_ip6)) return -EOPNOTSUPP; + memset(&cld_filter, 0, sizeof(cld_filter)); + /* copy element needed to add cloud filter from filter */ i40e_set_cld_element(filter, &cld_filter.element); @@ -11709,6 +11713,8 @@ i40e_status i40e_set_partition_bw_setting(struct i40e_pf *pf) struct i40e_aqc_configure_partition_bw_data bw_data; i40e_status status; + memset(&bw_data, 0, sizeof(bw_data)); + /* Set the valid bit for this PF */ bw_data.pf_valid_bits = cpu_to_le16(BIT(pf->hw.pf_id)); bw_data.max_bw[pf->hw.pf_id] = pf->max_bw & I40E_ALT_BW_VALUE_MASK; From 4cdb9f80dcd46aab3c0020b4a6920c22735c5d6e Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Tue, 24 Nov 2020 15:08:27 +0000 Subject: [PATCH 160/506] i40e: Fix overwriting flow control settings during driver loading During driver loading flow control settings were written to FW using a variable which was always zero, since it was being set only by ethtool. This behavior has been corrected and driver no longer overwrites the default FW/NVM settings. Fixes: 373149fc99a0 ("i40e: Decrease the scope of rtnl lock") Signed-off-by: Dawid Lukwinski Signed-off-by: Mateusz Palczewski Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 27 --------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 90c6c991aebc..53efb3a53df2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -10005,7 +10005,6 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) int old_recovery_mode_bit = test_bit(__I40E_RECOVERY_MODE, pf->state); struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; struct i40e_hw *hw = &pf->hw; - u8 set_fc_aq_fail = 0; i40e_status ret; u32 val; int v; @@ -10131,13 +10130,6 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) i40e_stat_str(&pf->hw, ret), i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); - /* make sure our flow control settings are restored */ - ret = i40e_set_fc(&pf->hw, &set_fc_aq_fail, true); - if (ret) - dev_dbg(&pf->pdev->dev, "setting flow control: ret = %s last_status = %s\n", - i40e_stat_str(&pf->hw, ret), - i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); - /* Rebuild the VSIs and VEBs that existed before reset. * They are still in our local switch element arrays, so only * need to rebuild the switch model in the HW. @@ -14720,7 +14712,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) int err; u32 val; u32 i; - u8 set_fc_aq_fail; err = pci_enable_device_mem(pdev); if (err) @@ -15054,24 +15045,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } INIT_LIST_HEAD(&pf->vsi[pf->lan_vsi]->ch_list); - /* Make sure flow control is set according to current settings */ - err = i40e_set_fc(hw, &set_fc_aq_fail, true); - if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_GET) - dev_dbg(&pf->pdev->dev, - "Set fc with err %s aq_err %s on get_phy_cap\n", - i40e_stat_str(hw, err), - i40e_aq_str(hw, hw->aq.asq_last_status)); - if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_SET) - dev_dbg(&pf->pdev->dev, - "Set fc with err %s aq_err %s on set_phy_config\n", - i40e_stat_str(hw, err), - i40e_aq_str(hw, hw->aq.asq_last_status)); - if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_UPDATE) - dev_dbg(&pf->pdev->dev, - "Set fc with err %s aq_err %s on get_link_info\n", - i40e_stat_str(hw, err), - i40e_aq_str(hw, hw->aq.asq_last_status)); - /* if FDIR VSI was set up, start it now */ for (i = 0; i < pf->num_alloc_vsi; i++) { if (pf->vsi[i] && pf->vsi[i]->type == I40E_VSI_FDIR) { From 28b1208e7a7fa3ddc9345b022bb93e53d9dcc28a Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Fri, 27 Nov 2020 10:39:03 +0000 Subject: [PATCH 161/506] i40e: Fix addition of RX filters after enabling FW LLDP agent Fix addition of VLAN filter for PF after enabling FW LLDP agent. Changing LLDP Agent causes FW to re-initialize per NVM settings. Remove default PF filter and move "Enable/Disable" to currently used reset flag. Without this patch PF would try to add MAC VLAN filter with default switch filter present. This causes AQ error and sets promiscuous mode on. Fixes: c65e78f87f81 ("i40e: Further implementation of LLDP") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Mateusz Palczewski Reviewed-by: Sylwester Dziedziuch Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 16 +++++++++------- drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 26ba1f3eb2d8..9e81f85ee2d8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -4878,7 +4878,7 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) enum i40e_admin_queue_err adq_err; struct i40e_vsi *vsi = np->vsi; struct i40e_pf *pf = vsi->back; - bool is_reset_needed; + u32 reset_needed = 0; i40e_status status; u32 i, j; @@ -4923,9 +4923,11 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) flags_complete: changed_flags = orig_flags ^ new_flags; - is_reset_needed = !!(changed_flags & (I40E_FLAG_VEB_STATS_ENABLED | - I40E_FLAG_LEGACY_RX | I40E_FLAG_SOURCE_PRUNING_DISABLED | - I40E_FLAG_DISABLE_FW_LLDP)); + if (changed_flags & I40E_FLAG_DISABLE_FW_LLDP) + reset_needed = I40E_PF_RESET_AND_REBUILD_FLAG; + if (changed_flags & (I40E_FLAG_VEB_STATS_ENABLED | + I40E_FLAG_LEGACY_RX | I40E_FLAG_SOURCE_PRUNING_DISABLED)) + reset_needed = BIT(__I40E_PF_RESET_REQUESTED); /* Before we finalize any flag changes, we need to perform some * checks to ensure that the changes are supported and safe. @@ -5057,7 +5059,7 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) case I40E_AQ_RC_EEXIST: dev_warn(&pf->pdev->dev, "FW LLDP agent is already running\n"); - is_reset_needed = false; + reset_needed = 0; break; case I40E_AQ_RC_EPERM: dev_warn(&pf->pdev->dev, @@ -5086,8 +5088,8 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) /* Issue reset to cause things to take effect, as additional bits * are added we will need to create a mask of bits requiring reset */ - if (is_reset_needed) - i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true); + if (reset_needed) + i40e_do_reset(pf, reset_needed, true); return 0; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 53efb3a53df2..3505d641660b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8537,11 +8537,6 @@ void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, bool lock_acquired) dev_dbg(&pf->pdev->dev, "PFR requested\n"); i40e_handle_reset_warning(pf, lock_acquired); - dev_info(&pf->pdev->dev, - pf->flags & I40E_FLAG_DISABLE_FW_LLDP ? - "FW LLDP is disabled\n" : - "FW LLDP is enabled\n"); - } else if (reset_flags & I40E_PF_RESET_AND_REBUILD_FLAG) { /* Request a PF Reset * @@ -8549,6 +8544,10 @@ void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, bool lock_acquired) */ i40e_prep_for_reset(pf, lock_acquired); i40e_reset_and_rebuild(pf, true, lock_acquired); + dev_info(&pf->pdev->dev, + pf->flags & I40E_FLAG_DISABLE_FW_LLDP ? + "FW LLDP is disabled\n" : + "FW LLDP is enabled\n"); } else if (reset_flags & BIT_ULL(__I40E_REINIT_REQUESTED)) { int v; From dc8812626440fa6a27f1f3f654f6dc435e042e42 Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Fri, 27 Nov 2020 11:23:01 +0000 Subject: [PATCH 162/506] i40e: Fix VFs not created When creating VFs they were sometimes not getting resources. It was caused by not executing i40e_reset_all_vfs due to flag __I40E_VF_DISABLE being set on PF. Because of this IAVF was never able to finish setup sequence never getting reset indication from PF. Changed test_and_set_bit __I40E_VF_DISABLE in i40e_sync_filters_subtask to test_bit and removed clear_bit. This function should not set this bit it should only check if it hasn't been already set. Fixes: a7542b876075 ("i40e: check __I40E_VF_DISABLE bit in i40e_sync_filters_subtask") Signed-off-by: Sylwester Dziedziuch Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3505d641660b..2e22ab5a0f9a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2616,7 +2616,7 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf) return; if (!test_and_clear_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state)) return; - if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) { + if (test_bit(__I40E_VF_DISABLE, pf->state)) { set_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state); return; } @@ -2634,7 +2634,6 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf) } } } - clear_bit(__I40E_VF_DISABLE, pf->state); } /** From f105aa940e78a87b6b6c82d7c230db86386ff013 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Sat, 16 Jan 2021 01:49:48 +0200 Subject: [PATCH 163/506] riscv: add BUILTIN_DTB support for MMU-enabled targets Sometimes, especially in a production system we may not want to use a "smart bootloader" like u-boot to load kernel, ramdisk and device tree from a filesystem on eMMC, but rather load the kernel from a NAND partition and just run it as soon as we can, and in this case it is convenient to have device tree compiled into the kernel binary. Since this case is not limited to MMU-less systems, let's support it for these which have MMU enabled too. While at it, provide __dtb_start as a parameter to setup_vm() in BUILTIN_DTB case, so we don't have to duplicate BUILTIN_DTB specific processing in MMU-enabled and MMU-disabled versions of setup_vm(). Signed-off-by: Vitaly Wool Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 - arch/riscv/kernel/head.S | 4 ++++ arch/riscv/mm/init.c | 19 +++++++++++++------ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c8d45da8edd9..8eadd1cbd524 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -460,7 +460,6 @@ endmenu config BUILTIN_DTB def_bool n - depends on RISCV_M_MODE depends on OF menu "Power management options" diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 16e9941900c4..f5a9bad86e58 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -260,7 +260,11 @@ clear_bss_done: /* Initialize page tables and relocate to virtual addresses */ la sp, init_thread_union + THREAD_SIZE +#ifdef CONFIG_BUILTIN_DTB + la a0, __dtb_start +#else mv a0, s1 +#endif /* CONFIG_BUILTIN_DTB */ call setup_vm #ifdef CONFIG_MMU la a0, early_pg_dir diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 30b61f2c6b87..45faad7c4291 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -192,10 +192,13 @@ void __init setup_bootmem(void) #endif /* CONFIG_BLK_DEV_INITRD */ /* - * Avoid using early_init_fdt_reserve_self() since __pa() does + * If DTB is built in, no need to reserve its memblock. + * Otherwise, do reserve it but avoid using + * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); early_init_fdt_scan_reserved_mem(); dma_contiguous_reserve(dma32_phys_limit); @@ -499,6 +502,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) /* Setup early PMD for DTB */ create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, (uintptr_t)early_dtb_pmd, PGDIR_SIZE, PAGE_TABLE); +#ifndef CONFIG_BUILTIN_DTB /* Create two consecutive PMD mappings for FDT early scan */ pa = dtb_pa & ~(PMD_SIZE - 1); create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, @@ -506,7 +510,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); +#else /* CONFIG_BUILTIN_DTB */ + dtb_early_va = __va(dtb_pa); +#endif /* CONFIG_BUILTIN_DTB */ #else +#ifndef CONFIG_BUILTIN_DTB /* Create two consecutive PGD mappings for FDT early scan */ pa = dtb_pa & ~(PGDIR_SIZE - 1); create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, @@ -514,6 +522,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA + PGDIR_SIZE, pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); +#else /* CONFIG_BUILTIN_DTB */ + dtb_early_va = __va(dtb_pa); +#endif /* CONFIG_BUILTIN_DTB */ #endif dtb_early_pa = dtb_pa; @@ -604,11 +615,7 @@ static void __init setup_vm_final(void) #else asmlinkage void __init setup_vm(uintptr_t dtb_pa) { -#ifdef CONFIG_BUILTIN_DTB - dtb_early_va = (void *) __dtb_start; -#else dtb_early_va = (void *)dtb_pa; -#endif dtb_early_pa = dtb_pa; } From fade5cad9339a627c5ad029e3577582b6292df03 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 15 Jan 2021 13:46:03 +0800 Subject: [PATCH 164/506] initrd: Add the preprocessor guard in initrd.h Add the preprocessor guard in initrd.h to prevent possible build error from the multiple inclusion of same header file multiple time. Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- include/linux/initrd.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 8db6f8c8030b..fc30ac30e10e 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_INITRD_H +#define __LINUX_INITRD_H + #define INITRD_MINOR 250 /* shouldn't collide with /dev/ram* too soon ... */ /* starting block # of image */ @@ -24,3 +27,5 @@ extern char __initramfs_start[]; extern unsigned long __initramfs_size; void console_on_rootfs(void); + +#endif /* __LINUX_INITRD_H */ From c72160fe05fb978ad859ba053c4462c2bb960b13 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 15 Jan 2021 13:46:04 +0800 Subject: [PATCH 165/506] initramfs: Provide a common initrd reserve function Some architectures(eg, ARM and riscv) have similar logic to check and reserve the memory of initrd, let's provide a common function reserve_initrd_mem() to reduce duplicated code. Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- include/linux/initrd.h | 6 ++++++ init/initramfs.c | 45 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/include/linux/initrd.h b/include/linux/initrd.h index fc30ac30e10e..85c15717af34 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -18,6 +18,12 @@ extern int initrd_below_start_ok; extern unsigned long initrd_start, initrd_end; extern void free_initrd_mem(unsigned long, unsigned long); +#ifdef CONFIG_BLK_DEV_INITRD +extern void __init reserve_initrd_mem(void); +#else +static inline void __init reserve_initrd_mem(void) {} +#endif + extern phys_addr_t phys_initrd_start; extern unsigned long phys_initrd_size; diff --git a/init/initramfs.c b/init/initramfs.c index 55b74d7e5260..f75c89e9d602 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -535,6 +535,51 @@ extern unsigned long __initramfs_size; #include #include +void __init reserve_initrd_mem(void) +{ + phys_addr_t start; + unsigned long size; + + /* Ignore the virtul address computed during device tree parsing */ + initrd_start = initrd_end = 0; + + if (!phys_initrd_size) + return; + /* + * Round the memory region to page boundaries as per free_initrd_mem() + * This allows us to detect whether the pages overlapping the initrd + * are in use, but more importantly, reserves the entire set of pages + * as we don't want these pages allocated for other purposes. + */ + start = round_down(phys_initrd_start, PAGE_SIZE); + size = phys_initrd_size + (phys_initrd_start - start); + size = round_up(size, PAGE_SIZE); + + if (!memblock_is_region_memory(start, size)) { + pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region", + (u64)start, size); + goto disable; + } + + if (memblock_is_region_reserved(start, size)) { + pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n", + (u64)start, size); + goto disable; + } + + memblock_reserve(start, size); + /* Now convert initrd to virtual addresses */ + initrd_start = (unsigned long)__va(phys_initrd_start); + initrd_end = initrd_start + phys_initrd_size; + initrd_below_start_ok = 1; + + return; +disable: + pr_cont(" - disabling initrd\n"); + initrd_start = 0; + initrd_end = 0; +} + void __weak __init free_initrd_mem(unsigned long start, unsigned long end) { #ifdef CONFIG_ARCH_KEEP_MEMBLOCK From aec33b54af55ef025e03e3dfbab3b8abe00eaa22 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 15 Jan 2021 13:46:06 +0800 Subject: [PATCH 166/506] riscv: Covert to reserve_initrd_mem() Covert to the generic reserve_initrd_mem() function. Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 54 +------------------------------------------- 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 45faad7c4291..c9bbab76358b 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -106,55 +106,6 @@ void __init mem_init(void) print_vm_layout(); } -#ifdef CONFIG_BLK_DEV_INITRD -static void __init setup_initrd(void) -{ - phys_addr_t start; - unsigned long size; - - /* Ignore the virtul address computed during device tree parsing */ - initrd_start = initrd_end = 0; - - if (!phys_initrd_size) - return; - /* - * Round the memory region to page boundaries as per free_initrd_mem() - * This allows us to detect whether the pages overlapping the initrd - * are in use, but more importantly, reserves the entire set of pages - * as we don't want these pages allocated for other purposes. - */ - start = round_down(phys_initrd_start, PAGE_SIZE); - size = phys_initrd_size + (phys_initrd_start - start); - size = round_up(size, PAGE_SIZE); - - if (!memblock_is_region_memory(start, size)) { - pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region", - (u64)start, size); - goto disable; - } - - if (memblock_is_region_reserved(start, size)) { - pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n", - (u64)start, size); - goto disable; - } - - memblock_reserve(start, size); - /* Now convert initrd to virtual addresses */ - initrd_start = (unsigned long)__va(phys_initrd_start); - initrd_end = initrd_start + phys_initrd_size; - initrd_below_start_ok = 1; - - pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", - (void *)(initrd_start), size); - return; -disable: - pr_cont(" - disabling initrd\n"); - initrd_start = 0; - initrd_end = 0; -} -#endif /* CONFIG_BLK_DEV_INITRD */ - void __init setup_bootmem(void) { phys_addr_t mem_start = 0; @@ -187,10 +138,7 @@ void __init setup_bootmem(void) dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); set_max_mapnr(max_low_pfn); -#ifdef CONFIG_BLK_DEV_INITRD - setup_initrd(); -#endif /* CONFIG_BLK_DEV_INITRD */ - + reserve_initrd_mem(); /* * If DTB is built in, no need to reserve its memblock. * Otherwise, do reserve it but avoid using From e178d670f251b6947d6be99c0014e9a57ad4f0e0 Mon Sep 17 00:00:00 2001 From: Nylon Chen Date: Sat, 16 Jan 2021 13:58:35 +0800 Subject: [PATCH 167/506] riscv/kasan: add KASAN_VMALLOC support It references to x86/s390 architecture. So, it doesn't map the early shadow page to cover VMALLOC space. Prepopulate top level page table for the range that would otherwise be empty. lower levels are filled dynamically upon memory allocation while booting. Signed-off-by: Nylon Chen Signed-off-by: Nick Hu Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/mm/kasan_init.c | 57 +++++++++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 8eadd1cbd524..3832a537c5d6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -57,6 +57,7 @@ config RISCV select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if MMU && 64BIT + select HAVE_ARCH_KASAN_VMALLOC if MMU && 64BIT select HAVE_ARCH_KGDB select HAVE_ARCH_KGDB_QXFER_PKT select HAVE_ARCH_MMAP_RND_BITS if MMU diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 12ddd1f6bf70..4b9149f963d3 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -9,6 +9,19 @@ #include #include #include +#include + +static __init void *early_alloc(size_t size, int node) +{ + void *ptr = memblock_alloc_try_nid(size, size, + __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node); + + if (!ptr) + panic("%pS: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n", + __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS)); + + return ptr; +} extern pgd_t early_pg_dir[PTRS_PER_PGD]; asmlinkage void __init kasan_early_init(void) @@ -83,6 +96,40 @@ static void __init populate(void *start, void *end) memset(start, 0, end - start); } +void __init kasan_shallow_populate(void *start, void *end) +{ + unsigned long vaddr = (unsigned long)start & PAGE_MASK; + unsigned long vend = PAGE_ALIGN((unsigned long)end); + unsigned long pfn; + int index; + void *p; + pud_t *pud_dir, *pud_k; + pgd_t *pgd_dir, *pgd_k; + p4d_t *p4d_dir, *p4d_k; + + while (vaddr < vend) { + index = pgd_index(vaddr); + pfn = csr_read(CSR_SATP) & SATP_PPN; + pgd_dir = (pgd_t *)pfn_to_virt(pfn) + index; + pgd_k = init_mm.pgd + index; + pgd_dir = pgd_offset_k(vaddr); + set_pgd(pgd_dir, *pgd_k); + + p4d_dir = p4d_offset(pgd_dir, vaddr); + p4d_k = p4d_offset(pgd_k, vaddr); + + vaddr = (vaddr + PUD_SIZE) & PUD_MASK; + pud_dir = pud_offset(p4d_dir, vaddr); + pud_k = pud_offset(p4d_k, vaddr); + + if (pud_present(*pud_dir)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + pud_populate(&init_mm, pud_dir, p); + } + vaddr += PAGE_SIZE; + } +} + void __init kasan_init(void) { phys_addr_t _start, _end; @@ -90,7 +137,15 @@ void __init kasan_init(void) kasan_populate_early_shadow((void *)KASAN_SHADOW_START, (void *)kasan_mem_to_shadow((void *) - VMALLOC_END)); + VMEMMAP_END)); + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_shallow_populate( + (void *)kasan_mem_to_shadow((void *)VMALLOC_START), + (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); + else + kasan_populate_early_shadow( + (void *)kasan_mem_to_shadow((void *)VMALLOC_START), + (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); for_each_mem_range(i, &_start, &_end) { void *start = (void *)_start; From 5da9cbd2b200369fd190c81bc1253e9a17ab3e8d Mon Sep 17 00:00:00 2001 From: tangchunyou Date: Thu, 21 Jan 2021 09:55:13 +0800 Subject: [PATCH 168/506] arch/riscv:fix typo in a comment in arch/riscv/kernel/image-vars.h "kerne" -> "kernel" Signed-off-by: WenZhang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/image-vars.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/image-vars.h b/arch/riscv/kernel/image-vars.h index 8c212efb37a6..71a76a623257 100644 --- a/arch/riscv/kernel/image-vars.h +++ b/arch/riscv/kernel/image-vars.h @@ -3,7 +3,7 @@ * Copyright (C) 2020 Western Digital Corporation or its affiliates. * Linker script variables to be set after section resolution, as * ld.lld does not like variables assigned before SECTIONS is processed. - * Based on arch/arm64/kerne/image-vars.h + * Based on arch/arm64/kernel/image-vars.h */ #ifndef __RISCV_KERNEL_IMAGE_VARS_H #define __RISCV_KERNEL_IMAGE_VARS_H From d4c34d09ab03e1e631fe195ddf35365a1273be9c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 Jan 2021 09:58:40 +0900 Subject: [PATCH 169/506] pinctrl: Add RISC-V Canaan Kendryte K210 FPIOA driver Add the pinctrl-k210.c pinctrl driver for the Canaan Kendryte K210 field programmable IO array (FPIOA) to allow configuring the SoC pin functions. The K210 has 48 programmable pins which can take any of 256 possible functions. This patch is inspired from the k210 pinctrl driver for the u-boot project and contains many direct contributions from Sean Anderson. The MAINTAINERS file is updated, adding the entry "CANAAN/KENDRYTE K210 SOC FPIOA DRIVER" with myself listed as maintainer for this driver. Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Signed-off-by: Sean Anderson Signed-off-by: Damien Le Moal Reviewed-by: Linus Walleij Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 7 + arch/riscv/Kconfig.socs | 1 + drivers/pinctrl/Kconfig | 13 + drivers/pinctrl/Makefile | 1 + drivers/pinctrl/pinctrl-k210.c | 985 +++++++++++++++++++++++++++++++++ 5 files changed, 1007 insertions(+) create mode 100644 drivers/pinctrl/pinctrl-k210.c diff --git a/MAINTAINERS b/MAINTAINERS index 380a446d4d4d..54f356e67491 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3860,6 +3860,13 @@ W: https://github.com/Cascoda/ca8210-linux.git F: Documentation/devicetree/bindings/net/ieee802154/ca8210.txt F: drivers/net/ieee802154/ca8210.c +CANAAN/KENDRYTE K210 SOC FPIOA DRIVER +M: Damien Le Moal +L: linux-riscv@lists.infradead.org +L: linux-gpio@vger.kernel.org (pinctrl driver) +F: Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml +F: drivers/pinctrl/pinctrl-k210.c + CANAAN/KENDRYTE K210 SOC RESET CONTROLLER DRIVER M: Damien Le Moal L: linux-kernel@vger.kernel.org diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 57e53219c500..6402746c68f3 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -30,6 +30,7 @@ config SOC_CANAAN select SERIAL_SIFIVE_CONSOLE if TTY select SIFIVE_PLIC select ARCH_HAS_RESET_CONTROLLER + select PINCTRL help This enables support for Canaan Kendryte K210 SoC platform hardware. diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index d4b2f2e2ed75..cd437e3cc255 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -394,6 +394,19 @@ config PINCTRL_MICROCHIP_SGPIO connect control signals from SFP modules and to act as an LED controller. +config PINCTRL_K210 + bool "Pinctrl driver for the Canaan Kendryte K210 SoC" + depends on RISCV && SOC_CANAAN && OF + select GENERIC_PINMUX_FUNCTIONS + select GENERIC_PINCONF + select GPIOLIB + select OF_GPIO + select REGMAP_MMIO + default SOC_CANAAN + help + Add support for the Canaan Kendryte K210 RISC-V SOC Field + Programmable IO Array (FPIOA) controller. + source "drivers/pinctrl/actions/Kconfig" source "drivers/pinctrl/aspeed/Kconfig" source "drivers/pinctrl/bcm/Kconfig" diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index 5bb9bb6cc3ce..152c8fe51726 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_PINCTRL_RK805) += pinctrl-rk805.o obj-$(CONFIG_PINCTRL_OCELOT) += pinctrl-ocelot.o obj-$(CONFIG_PINCTRL_MICROCHIP_SGPIO) += pinctrl-microchip-sgpio.o obj-$(CONFIG_PINCTRL_EQUILIBRIUM) += pinctrl-equilibrium.o +obj-$(CONFIG_PINCTRL_K210) += pinctrl-k210.o obj-y += actions/ obj-$(CONFIG_ARCH_ASPEED) += aspeed/ diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c new file mode 100644 index 000000000000..8a733cf77ba0 --- /dev/null +++ b/drivers/pinctrl/pinctrl-k210.c @@ -0,0 +1,985 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Sean Anderson + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "core.h" +#include "pinconf.h" +#include "pinctrl-utils.h" + +/* + * The K210 only implements 8 drive levels, even though + * there is register space for 16 + */ +#define K210_PC_DRIVE_MASK GENMASK(11, 8) +#define K210_PC_DRIVE_SHIFT 8 +#define K210_PC_DRIVE_0 (0 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_1 (1 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_2 (2 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_3 (3 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_4 (4 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_5 (5 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_6 (6 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_7 (7 << K210_PC_DRIVE_SHIFT) +#define K210_PC_DRIVE_MAX 7 +#define K210_PC_MODE_MASK GENMASK(23, 12) + +/* + * output enabled == PC_OE & (PC_OE_INV ^ FUNCTION_OE) + * where FUNCTION_OE is a physical signal from the function. + */ +#define K210_PC_OE BIT(12) /* Output Enable */ +#define K210_PC_OE_INV BIT(13) /* INVert Output Enable */ +#define K210_PC_DO_OE BIT(14) /* set Data Out to Output Enable sig */ +#define K210_PC_DO_INV BIT(15) /* INVert final Data Output */ +#define K210_PC_PU BIT(16) /* Pull Up */ +#define K210_PC_PD BIT(17) /* Pull Down */ +/* Strong pull up not implemented on K210 */ +#define K210_PC_SL BIT(19) /* reduce SLew rate */ +/* Same semantics as OE above */ +#define K210_PC_IE BIT(20) /* Input Enable */ +#define K210_PC_IE_INV BIT(21) /* INVert Input Enable */ +#define K210_PC_DI_INV BIT(22) /* INVert Data Input */ +#define K210_PC_ST BIT(23) /* Schmitt Trigger */ +#define K210_PC_DI BIT(31) /* raw Data Input */ + +#define K210_PC_BIAS_MASK (K210_PC_PU & K210_PC_PD) + +#define K210_PC_MODE_IN (K210_PC_IE | K210_PC_ST) +#define K210_PC_MODE_OUT (K210_PC_DRIVE_7 | K210_PC_OE) +#define K210_PC_MODE_I2C (K210_PC_MODE_IN | K210_PC_SL | \ + K210_PC_OE | K210_PC_PU) +#define K210_PC_MODE_SCCB (K210_PC_MODE_I2C | \ + K210_PC_OE_INV | K210_PC_IE_INV) +#define K210_PC_MODE_SPI (K210_PC_MODE_IN | K210_PC_IE_INV | \ + K210_PC_MODE_OUT | K210_PC_OE_INV) +#define K210_PC_MODE_GPIO (K210_PC_MODE_IN | K210_PC_MODE_OUT) + +#define K210_PG_FUNC GENMASK(7, 0) +#define K210_PG_DO BIT(8) +#define K210_PG_PIN GENMASK(22, 16) + +/* + * struct k210_fpioa: Kendryte K210 FPIOA memory mapped registers + * @pins: 48 32-bits IO pin registers + * @tie_en: 256 (one per function) input tie enable bits + * @tie_val: 256 (one per function) input tie value bits + */ +struct k210_fpioa { + u32 pins[48]; + u32 tie_en[8]; + u32 tie_val[8]; +}; + +struct k210_fpioa_data { + + struct device *dev; + struct pinctrl_dev *pctl; + + struct k210_fpioa __iomem *fpioa; + struct regmap *sysctl_map; + u32 power_offset; + struct clk *clk; + struct clk *pclk; +}; + +#define K210_PIN_NAME(i) ("IO_" #i) +#define K210_PIN(i) [(i)] = PINCTRL_PIN((i), K210_PIN_NAME(i)) + +static const struct pinctrl_pin_desc k210_pins[] = { + K210_PIN(0), K210_PIN(1), K210_PIN(2), + K210_PIN(3), K210_PIN(4), K210_PIN(5), + K210_PIN(6), K210_PIN(7), K210_PIN(8), + K210_PIN(9), K210_PIN(10), K210_PIN(11), + K210_PIN(12), K210_PIN(13), K210_PIN(14), + K210_PIN(15), K210_PIN(16), K210_PIN(17), + K210_PIN(18), K210_PIN(19), K210_PIN(20), + K210_PIN(21), K210_PIN(22), K210_PIN(23), + K210_PIN(24), K210_PIN(25), K210_PIN(26), + K210_PIN(27), K210_PIN(28), K210_PIN(29), + K210_PIN(30), K210_PIN(31), K210_PIN(32), + K210_PIN(33), K210_PIN(34), K210_PIN(35), + K210_PIN(36), K210_PIN(37), K210_PIN(38), + K210_PIN(39), K210_PIN(40), K210_PIN(41), + K210_PIN(42), K210_PIN(43), K210_PIN(44), + K210_PIN(45), K210_PIN(46), K210_PIN(47) +}; + +#define K210_NPINS ARRAY_SIZE(k210_pins) + +/* + * Pin groups: each of the 48 programmable pins is a group. + * To this are added 8 power domain groups, which for the purposes of + * the pin subsystem, contain no pins. The power domain groups only exist + * to set the power level. The id should never be used (since there are + * no pins 48-55). + */ +static const char *const k210_group_names[] = { + /* The first 48 groups are for pins, one each */ + K210_PIN_NAME(0), K210_PIN_NAME(1), K210_PIN_NAME(2), + K210_PIN_NAME(3), K210_PIN_NAME(4), K210_PIN_NAME(5), + K210_PIN_NAME(6), K210_PIN_NAME(7), K210_PIN_NAME(8), + K210_PIN_NAME(9), K210_PIN_NAME(10), K210_PIN_NAME(11), + K210_PIN_NAME(12), K210_PIN_NAME(13), K210_PIN_NAME(14), + K210_PIN_NAME(15), K210_PIN_NAME(16), K210_PIN_NAME(17), + K210_PIN_NAME(18), K210_PIN_NAME(19), K210_PIN_NAME(20), + K210_PIN_NAME(21), K210_PIN_NAME(22), K210_PIN_NAME(23), + K210_PIN_NAME(24), K210_PIN_NAME(25), K210_PIN_NAME(26), + K210_PIN_NAME(27), K210_PIN_NAME(28), K210_PIN_NAME(29), + K210_PIN_NAME(30), K210_PIN_NAME(31), K210_PIN_NAME(32), + K210_PIN_NAME(33), K210_PIN_NAME(34), K210_PIN_NAME(35), + K210_PIN_NAME(36), K210_PIN_NAME(37), K210_PIN_NAME(38), + K210_PIN_NAME(39), K210_PIN_NAME(40), K210_PIN_NAME(41), + K210_PIN_NAME(42), K210_PIN_NAME(43), K210_PIN_NAME(44), + K210_PIN_NAME(45), K210_PIN_NAME(46), K210_PIN_NAME(47), + [48] = "A0", [49] = "A1", [50] = "A2", + [51] = "B3", [52] = "B4", [53] = "B5", + [54] = "C6", [55] = "C7" +}; + +#define K210_NGROUPS ARRAY_SIZE(k210_group_names) + +enum k210_pinctrl_mode_id { + K210_PC_DEFAULT_DISABLED, + K210_PC_DEFAULT_IN, + K210_PC_DEFAULT_IN_TIE, + K210_PC_DEFAULT_OUT, + K210_PC_DEFAULT_I2C, + K210_PC_DEFAULT_SCCB, + K210_PC_DEFAULT_SPI, + K210_PC_DEFAULT_GPIO, + K210_PC_DEFAULT_INT13, +}; + +#define K210_PC_DEFAULT(mode) \ + [K210_PC_DEFAULT_##mode] = K210_PC_MODE_##mode + +static const u32 k210_pinconf_mode_id_to_mode[] = { + [K210_PC_DEFAULT_DISABLED] = 0, + K210_PC_DEFAULT(IN), + [K210_PC_DEFAULT_IN_TIE] = K210_PC_MODE_IN, + K210_PC_DEFAULT(OUT), + K210_PC_DEFAULT(I2C), + K210_PC_DEFAULT(SCCB), + K210_PC_DEFAULT(SPI), + K210_PC_DEFAULT(GPIO), + [K210_PC_DEFAULT_INT13] = K210_PC_MODE_IN | K210_PC_PU, +}; + +#undef DEFAULT + +/* + * Pin functions configuration information. + */ +struct k210_pcf_info { + char name[15]; + u8 mode_id; +}; + +#define K210_FUNC(id, mode) \ + [K210_PCF_##id] = { \ + .name = #id, \ + .mode_id = K210_PC_DEFAULT_##mode \ + } + +static const struct k210_pcf_info k210_pcf_infos[] = { + K210_FUNC(JTAG_TCLK, IN), + K210_FUNC(JTAG_TDI, IN), + K210_FUNC(JTAG_TMS, IN), + K210_FUNC(JTAG_TDO, OUT), + K210_FUNC(SPI0_D0, SPI), + K210_FUNC(SPI0_D1, SPI), + K210_FUNC(SPI0_D2, SPI), + K210_FUNC(SPI0_D3, SPI), + K210_FUNC(SPI0_D4, SPI), + K210_FUNC(SPI0_D5, SPI), + K210_FUNC(SPI0_D6, SPI), + K210_FUNC(SPI0_D7, SPI), + K210_FUNC(SPI0_SS0, OUT), + K210_FUNC(SPI0_SS1, OUT), + K210_FUNC(SPI0_SS2, OUT), + K210_FUNC(SPI0_SS3, OUT), + K210_FUNC(SPI0_ARB, IN_TIE), + K210_FUNC(SPI0_SCLK, OUT), + K210_FUNC(UARTHS_RX, IN), + K210_FUNC(UARTHS_TX, OUT), + K210_FUNC(RESV6, IN), + K210_FUNC(RESV7, IN), + K210_FUNC(CLK_SPI1, OUT), + K210_FUNC(CLK_I2C1, OUT), + K210_FUNC(GPIOHS0, GPIO), + K210_FUNC(GPIOHS1, GPIO), + K210_FUNC(GPIOHS2, GPIO), + K210_FUNC(GPIOHS3, GPIO), + K210_FUNC(GPIOHS4, GPIO), + K210_FUNC(GPIOHS5, GPIO), + K210_FUNC(GPIOHS6, GPIO), + K210_FUNC(GPIOHS7, GPIO), + K210_FUNC(GPIOHS8, GPIO), + K210_FUNC(GPIOHS9, GPIO), + K210_FUNC(GPIOHS10, GPIO), + K210_FUNC(GPIOHS11, GPIO), + K210_FUNC(GPIOHS12, GPIO), + K210_FUNC(GPIOHS13, GPIO), + K210_FUNC(GPIOHS14, GPIO), + K210_FUNC(GPIOHS15, GPIO), + K210_FUNC(GPIOHS16, GPIO), + K210_FUNC(GPIOHS17, GPIO), + K210_FUNC(GPIOHS18, GPIO), + K210_FUNC(GPIOHS19, GPIO), + K210_FUNC(GPIOHS20, GPIO), + K210_FUNC(GPIOHS21, GPIO), + K210_FUNC(GPIOHS22, GPIO), + K210_FUNC(GPIOHS23, GPIO), + K210_FUNC(GPIOHS24, GPIO), + K210_FUNC(GPIOHS25, GPIO), + K210_FUNC(GPIOHS26, GPIO), + K210_FUNC(GPIOHS27, GPIO), + K210_FUNC(GPIOHS28, GPIO), + K210_FUNC(GPIOHS29, GPIO), + K210_FUNC(GPIOHS30, GPIO), + K210_FUNC(GPIOHS31, GPIO), + K210_FUNC(GPIO0, GPIO), + K210_FUNC(GPIO1, GPIO), + K210_FUNC(GPIO2, GPIO), + K210_FUNC(GPIO3, GPIO), + K210_FUNC(GPIO4, GPIO), + K210_FUNC(GPIO5, GPIO), + K210_FUNC(GPIO6, GPIO), + K210_FUNC(GPIO7, GPIO), + K210_FUNC(UART1_RX, IN), + K210_FUNC(UART1_TX, OUT), + K210_FUNC(UART2_RX, IN), + K210_FUNC(UART2_TX, OUT), + K210_FUNC(UART3_RX, IN), + K210_FUNC(UART3_TX, OUT), + K210_FUNC(SPI1_D0, SPI), + K210_FUNC(SPI1_D1, SPI), + K210_FUNC(SPI1_D2, SPI), + K210_FUNC(SPI1_D3, SPI), + K210_FUNC(SPI1_D4, SPI), + K210_FUNC(SPI1_D5, SPI), + K210_FUNC(SPI1_D6, SPI), + K210_FUNC(SPI1_D7, SPI), + K210_FUNC(SPI1_SS0, OUT), + K210_FUNC(SPI1_SS1, OUT), + K210_FUNC(SPI1_SS2, OUT), + K210_FUNC(SPI1_SS3, OUT), + K210_FUNC(SPI1_ARB, IN_TIE), + K210_FUNC(SPI1_SCLK, OUT), + K210_FUNC(SPI2_D0, SPI), + K210_FUNC(SPI2_SS, IN), + K210_FUNC(SPI2_SCLK, IN), + K210_FUNC(I2S0_MCLK, OUT), + K210_FUNC(I2S0_SCLK, OUT), + K210_FUNC(I2S0_WS, OUT), + K210_FUNC(I2S0_IN_D0, IN), + K210_FUNC(I2S0_IN_D1, IN), + K210_FUNC(I2S0_IN_D2, IN), + K210_FUNC(I2S0_IN_D3, IN), + K210_FUNC(I2S0_OUT_D0, OUT), + K210_FUNC(I2S0_OUT_D1, OUT), + K210_FUNC(I2S0_OUT_D2, OUT), + K210_FUNC(I2S0_OUT_D3, OUT), + K210_FUNC(I2S1_MCLK, OUT), + K210_FUNC(I2S1_SCLK, OUT), + K210_FUNC(I2S1_WS, OUT), + K210_FUNC(I2S1_IN_D0, IN), + K210_FUNC(I2S1_IN_D1, IN), + K210_FUNC(I2S1_IN_D2, IN), + K210_FUNC(I2S1_IN_D3, IN), + K210_FUNC(I2S1_OUT_D0, OUT), + K210_FUNC(I2S1_OUT_D1, OUT), + K210_FUNC(I2S1_OUT_D2, OUT), + K210_FUNC(I2S1_OUT_D3, OUT), + K210_FUNC(I2S2_MCLK, OUT), + K210_FUNC(I2S2_SCLK, OUT), + K210_FUNC(I2S2_WS, OUT), + K210_FUNC(I2S2_IN_D0, IN), + K210_FUNC(I2S2_IN_D1, IN), + K210_FUNC(I2S2_IN_D2, IN), + K210_FUNC(I2S2_IN_D3, IN), + K210_FUNC(I2S2_OUT_D0, OUT), + K210_FUNC(I2S2_OUT_D1, OUT), + K210_FUNC(I2S2_OUT_D2, OUT), + K210_FUNC(I2S2_OUT_D3, OUT), + K210_FUNC(RESV0, DISABLED), + K210_FUNC(RESV1, DISABLED), + K210_FUNC(RESV2, DISABLED), + K210_FUNC(RESV3, DISABLED), + K210_FUNC(RESV4, DISABLED), + K210_FUNC(RESV5, DISABLED), + K210_FUNC(I2C0_SCLK, I2C), + K210_FUNC(I2C0_SDA, I2C), + K210_FUNC(I2C1_SCLK, I2C), + K210_FUNC(I2C1_SDA, I2C), + K210_FUNC(I2C2_SCLK, I2C), + K210_FUNC(I2C2_SDA, I2C), + K210_FUNC(DVP_XCLK, OUT), + K210_FUNC(DVP_RST, OUT), + K210_FUNC(DVP_PWDN, OUT), + K210_FUNC(DVP_VSYNC, IN), + K210_FUNC(DVP_HSYNC, IN), + K210_FUNC(DVP_PCLK, IN), + K210_FUNC(DVP_D0, IN), + K210_FUNC(DVP_D1, IN), + K210_FUNC(DVP_D2, IN), + K210_FUNC(DVP_D3, IN), + K210_FUNC(DVP_D4, IN), + K210_FUNC(DVP_D5, IN), + K210_FUNC(DVP_D6, IN), + K210_FUNC(DVP_D7, IN), + K210_FUNC(SCCB_SCLK, SCCB), + K210_FUNC(SCCB_SDA, SCCB), + K210_FUNC(UART1_CTS, IN), + K210_FUNC(UART1_DSR, IN), + K210_FUNC(UART1_DCD, IN), + K210_FUNC(UART1_RI, IN), + K210_FUNC(UART1_SIR_IN, IN), + K210_FUNC(UART1_DTR, OUT), + K210_FUNC(UART1_RTS, OUT), + K210_FUNC(UART1_OUT2, OUT), + K210_FUNC(UART1_OUT1, OUT), + K210_FUNC(UART1_SIR_OUT, OUT), + K210_FUNC(UART1_BAUD, OUT), + K210_FUNC(UART1_RE, OUT), + K210_FUNC(UART1_DE, OUT), + K210_FUNC(UART1_RS485_EN, OUT), + K210_FUNC(UART2_CTS, IN), + K210_FUNC(UART2_DSR, IN), + K210_FUNC(UART2_DCD, IN), + K210_FUNC(UART2_RI, IN), + K210_FUNC(UART2_SIR_IN, IN), + K210_FUNC(UART2_DTR, OUT), + K210_FUNC(UART2_RTS, OUT), + K210_FUNC(UART2_OUT2, OUT), + K210_FUNC(UART2_OUT1, OUT), + K210_FUNC(UART2_SIR_OUT, OUT), + K210_FUNC(UART2_BAUD, OUT), + K210_FUNC(UART2_RE, OUT), + K210_FUNC(UART2_DE, OUT), + K210_FUNC(UART2_RS485_EN, OUT), + K210_FUNC(UART3_CTS, IN), + K210_FUNC(UART3_DSR, IN), + K210_FUNC(UART3_DCD, IN), + K210_FUNC(UART3_RI, IN), + K210_FUNC(UART3_SIR_IN, IN), + K210_FUNC(UART3_DTR, OUT), + K210_FUNC(UART3_RTS, OUT), + K210_FUNC(UART3_OUT2, OUT), + K210_FUNC(UART3_OUT1, OUT), + K210_FUNC(UART3_SIR_OUT, OUT), + K210_FUNC(UART3_BAUD, OUT), + K210_FUNC(UART3_RE, OUT), + K210_FUNC(UART3_DE, OUT), + K210_FUNC(UART3_RS485_EN, OUT), + K210_FUNC(TIMER0_TOGGLE1, OUT), + K210_FUNC(TIMER0_TOGGLE2, OUT), + K210_FUNC(TIMER0_TOGGLE3, OUT), + K210_FUNC(TIMER0_TOGGLE4, OUT), + K210_FUNC(TIMER1_TOGGLE1, OUT), + K210_FUNC(TIMER1_TOGGLE2, OUT), + K210_FUNC(TIMER1_TOGGLE3, OUT), + K210_FUNC(TIMER1_TOGGLE4, OUT), + K210_FUNC(TIMER2_TOGGLE1, OUT), + K210_FUNC(TIMER2_TOGGLE2, OUT), + K210_FUNC(TIMER2_TOGGLE3, OUT), + K210_FUNC(TIMER2_TOGGLE4, OUT), + K210_FUNC(CLK_SPI2, OUT), + K210_FUNC(CLK_I2C2, OUT), + K210_FUNC(INTERNAL0, OUT), + K210_FUNC(INTERNAL1, OUT), + K210_FUNC(INTERNAL2, OUT), + K210_FUNC(INTERNAL3, OUT), + K210_FUNC(INTERNAL4, OUT), + K210_FUNC(INTERNAL5, OUT), + K210_FUNC(INTERNAL6, OUT), + K210_FUNC(INTERNAL7, OUT), + K210_FUNC(INTERNAL8, OUT), + K210_FUNC(INTERNAL9, IN), + K210_FUNC(INTERNAL10, IN), + K210_FUNC(INTERNAL11, IN), + K210_FUNC(INTERNAL12, IN), + K210_FUNC(INTERNAL13, INT13), + K210_FUNC(INTERNAL14, I2C), + K210_FUNC(INTERNAL15, IN), + K210_FUNC(INTERNAL16, IN), + K210_FUNC(INTERNAL17, IN), + K210_FUNC(CONSTANT, DISABLED), + K210_FUNC(INTERNAL18, IN), + K210_FUNC(DEBUG0, OUT), + K210_FUNC(DEBUG1, OUT), + K210_FUNC(DEBUG2, OUT), + K210_FUNC(DEBUG3, OUT), + K210_FUNC(DEBUG4, OUT), + K210_FUNC(DEBUG5, OUT), + K210_FUNC(DEBUG6, OUT), + K210_FUNC(DEBUG7, OUT), + K210_FUNC(DEBUG8, OUT), + K210_FUNC(DEBUG9, OUT), + K210_FUNC(DEBUG10, OUT), + K210_FUNC(DEBUG11, OUT), + K210_FUNC(DEBUG12, OUT), + K210_FUNC(DEBUG13, OUT), + K210_FUNC(DEBUG14, OUT), + K210_FUNC(DEBUG15, OUT), + K210_FUNC(DEBUG16, OUT), + K210_FUNC(DEBUG17, OUT), + K210_FUNC(DEBUG18, OUT), + K210_FUNC(DEBUG19, OUT), + K210_FUNC(DEBUG20, OUT), + K210_FUNC(DEBUG21, OUT), + K210_FUNC(DEBUG22, OUT), + K210_FUNC(DEBUG23, OUT), + K210_FUNC(DEBUG24, OUT), + K210_FUNC(DEBUG25, OUT), + K210_FUNC(DEBUG26, OUT), + K210_FUNC(DEBUG27, OUT), + K210_FUNC(DEBUG28, OUT), + K210_FUNC(DEBUG29, OUT), + K210_FUNC(DEBUG30, OUT), + K210_FUNC(DEBUG31, OUT), +}; + +#define PIN_CONFIG_OUTPUT_INVERT (PIN_CONFIG_END + 1) +#define PIN_CONFIG_INPUT_INVERT (PIN_CONFIG_END + 2) + +static const struct pinconf_generic_params k210_pinconf_custom_params[] = { + { "output-polarity-invert", PIN_CONFIG_OUTPUT_INVERT, 1 }, + { "input-polarity-invert", PIN_CONFIG_INPUT_INVERT, 1 }, +}; + +/* + * Max drive strength in uA. + */ +static const int k210_pinconf_drive_strength[] = { + [0] = 11200, + [1] = 16800, + [2] = 22300, + [3] = 27800, + [4] = 33300, + [5] = 38700, + [6] = 44100, + [7] = 49500, +}; + +static int k210_pinconf_get_drive(unsigned int max_strength_ua) +{ + int i; + + for (i = K210_PC_DRIVE_MAX; i; i--) { + if (k210_pinconf_drive_strength[i] <= max_strength_ua) + return i; + } + + return -EINVAL; +} + +static void k210_pinmux_set_pin_function(struct pinctrl_dev *pctldev, + u32 pin, u32 func) +{ + struct k210_fpioa_data *pdata = pinctrl_dev_get_drvdata(pctldev); + const struct k210_pcf_info *info = &k210_pcf_infos[func]; + u32 mode = k210_pinconf_mode_id_to_mode[info->mode_id]; + u32 val = func | mode; + + dev_dbg(pdata->dev, "set pin %u function %s (%u) -> 0x%08x\n", + pin, info->name, func, val); + + writel(val, &pdata->fpioa->pins[pin]); +} + +static int k210_pinconf_set_param(struct pinctrl_dev *pctldev, + unsigned int pin, + unsigned int param, unsigned int arg) +{ + struct k210_fpioa_data *pdata = pinctrl_dev_get_drvdata(pctldev); + u32 val = readl(&pdata->fpioa->pins[pin]); + int drive; + + dev_dbg(pdata->dev, "set pin %u param %u, arg 0x%x\n", + pin, param, arg); + + switch (param) { + case PIN_CONFIG_BIAS_DISABLE: + val &= ~K210_PC_BIAS_MASK; + break; + case PIN_CONFIG_BIAS_PULL_DOWN: + if (!arg) + return -EINVAL; + val |= K210_PC_PD; + break; + case PIN_CONFIG_BIAS_PULL_UP: + if (!arg) + return -EINVAL; + val |= K210_PC_PD; + break; + case PIN_CONFIG_DRIVE_STRENGTH: + arg *= 1000; + fallthrough; + case PIN_CONFIG_DRIVE_STRENGTH_UA: + drive = k210_pinconf_get_drive(arg); + if (drive < 0) + return drive; + val &= ~K210_PC_DRIVE_MASK; + val |= FIELD_PREP(K210_PC_DRIVE_MASK, drive); + break; + case PIN_CONFIG_INPUT_ENABLE: + if (arg) + val |= K210_PC_IE; + else + val &= ~K210_PC_IE; + break; + case PIN_CONFIG_INPUT_SCHMITT_ENABLE: + if (arg) + val |= K210_PC_ST; + else + val &= ~K210_PC_ST; + break; + case PIN_CONFIG_OUTPUT: + k210_pinmux_set_pin_function(pctldev, pin, K210_PCF_CONSTANT); + val = readl(&pdata->fpioa->pins[pin]); + val |= K210_PC_MODE_OUT; + if (!arg) + val |= K210_PC_DO_INV; + break; + case PIN_CONFIG_OUTPUT_ENABLE: + if (arg) + val |= K210_PC_OE; + else + val &= ~K210_PC_OE; + break; + case PIN_CONFIG_SLEW_RATE: + if (arg) + val |= K210_PC_SL; + else + val &= ~K210_PC_SL; + break; + case PIN_CONFIG_OUTPUT_INVERT: + if (arg) + val |= K210_PC_DO_INV; + else + val &= ~K210_PC_DO_INV; + break; + case PIN_CONFIG_INPUT_INVERT: + if (arg) + val |= K210_PC_DI_INV; + else + val &= ~K210_PC_DI_INV; + break; + default: + return -EINVAL; + } + + writel(val, &pdata->fpioa->pins[pin]); + + return 0; +} + +static int k210_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, + unsigned long *configs, unsigned int num_configs) +{ + unsigned int param, arg; + int i, ret; + + if (WARN_ON(pin >= K210_NPINS)) + return -EINVAL; + + for (i = 0; i < num_configs; i++) { + param = pinconf_to_config_param(configs[i]); + arg = pinconf_to_config_argument(configs[i]); + ret = k210_pinconf_set_param(pctldev, pin, param, arg); + if (ret) + return ret; + } + + return 0; +} + +static void k210_pinconf_dbg_show(struct pinctrl_dev *pctldev, + struct seq_file *s, unsigned int pin) +{ + struct k210_fpioa_data *pdata = pinctrl_dev_get_drvdata(pctldev); + + seq_printf(s, "%#x", readl(&pdata->fpioa->pins[pin])); +} + +static int k210_pinconf_group_set(struct pinctrl_dev *pctldev, + unsigned int selector, unsigned long *configs, + unsigned int num_configs) +{ + struct k210_fpioa_data *pdata = pinctrl_dev_get_drvdata(pctldev); + unsigned int param, arg; + u32 bit; + int i; + + /* Pins should be configured with pinmux, not groups*/ + if (selector < K210_NPINS) + return -EINVAL; + + /* Otherwise it's a power domain */ + for (i = 0; i < num_configs; i++) { + param = pinconf_to_config_param(configs[i]); + if (param != PIN_CONFIG_POWER_SOURCE) + return -EINVAL; + + arg = pinconf_to_config_argument(configs[i]); + bit = BIT(selector - K210_NPINS); + regmap_update_bits(pdata->sysctl_map, + pdata->power_offset, + bit, arg ? bit : 0); + } + + return 0; +} + +static void k210_pinconf_group_dbg_show(struct pinctrl_dev *pctldev, + struct seq_file *s, + unsigned int selector) +{ + struct k210_fpioa_data *pdata = pinctrl_dev_get_drvdata(pctldev); + int ret; + u32 val; + + if (selector < K210_NPINS) + return k210_pinconf_dbg_show(pctldev, s, selector); + + ret = regmap_read(pdata->sysctl_map, pdata->power_offset, &val); + if (ret) { + dev_err(pdata->dev, "Failed to read power reg\n"); + return; + } + + seq_printf(s, "%s: %s V", k210_group_names[selector], + val & BIT(selector - K210_NPINS) ? "1.8" : "3.3"); +} + +static const struct pinconf_ops k210_pinconf_ops = { + .is_generic = true, + .pin_config_set = k210_pinconf_set, + .pin_config_group_set = k210_pinconf_group_set, + .pin_config_dbg_show = k210_pinconf_dbg_show, + .pin_config_group_dbg_show = k210_pinconf_group_dbg_show, +}; + +static int k210_pinmux_get_function_count(struct pinctrl_dev *pctldev) +{ + return ARRAY_SIZE(k210_pcf_infos); +} + +static const char *k210_pinmux_get_function_name(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + return k210_pcf_infos[selector].name; +} + +static int k210_pinmux_get_function_groups(struct pinctrl_dev *pctldev, + unsigned int selector, + const char * const **groups, + unsigned int * const num_groups) +{ + /* Any function can be mapped to any pin */ + *groups = k210_group_names; + *num_groups = K210_NPINS; + + return 0; +} + +static int k210_pinmux_set_mux(struct pinctrl_dev *pctldev, + unsigned int function, + unsigned int group) +{ + /* Can't mux power domains */ + if (group >= K210_NPINS) + return -EINVAL; + + k210_pinmux_set_pin_function(pctldev, group, function); + + return 0; +} + +static const struct pinmux_ops k210_pinmux_ops = { + .get_functions_count = k210_pinmux_get_function_count, + .get_function_name = k210_pinmux_get_function_name, + .get_function_groups = k210_pinmux_get_function_groups, + .set_mux = k210_pinmux_set_mux, + .strict = true, +}; + +static int k210_pinctrl_get_groups_count(struct pinctrl_dev *pctldev) +{ + return K210_NGROUPS; +} + +static const char *k210_pinctrl_get_group_name(struct pinctrl_dev *pctldev, + unsigned int group) +{ + return k210_group_names[group]; +} + +static int k210_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, + unsigned int group, + const unsigned int **pins, + unsigned int *npins) +{ + if (group >= K210_NPINS) { + *pins = NULL; + *npins = 0; + return 0; + } + + *pins = &k210_pins[group].number; + *npins = 1; + + return 0; +} + +static void k210_pinctrl_pin_dbg_show(struct pinctrl_dev *pctldev, + struct seq_file *s, unsigned int offset) +{ + seq_printf(s, "%s", dev_name(pctldev->dev)); +} + +static int k210_pinctrl_dt_subnode_to_map(struct pinctrl_dev *pctldev, + struct device_node *np, + struct pinctrl_map **map, + unsigned int *reserved_maps, + unsigned int *num_maps) +{ + struct property *prop; + const __be32 *p; + int ret, pinmux_groups; + u32 pinmux_group; + unsigned long *configs = NULL; + unsigned int num_configs = 0; + unsigned int reserve = 0; + + ret = of_property_count_strings(np, "groups"); + if (!ret) + return pinconf_generic_dt_subnode_to_map(pctldev, np, map, + reserved_maps, num_maps, + PIN_MAP_TYPE_CONFIGS_GROUP); + + pinmux_groups = of_property_count_u32_elems(np, "pinmux"); + if (pinmux_groups <= 0) { + /* Ignore this node */ + return 0; + } + + ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, + &num_configs); + if (ret < 0) { + dev_err(pctldev->dev, "%pOF: could not parse node property\n", + np); + return ret; + } + + reserve = pinmux_groups * (1 + num_configs); + ret = pinctrl_utils_reserve_map(pctldev, map, reserved_maps, num_maps, + reserve); + if (ret < 0) + goto exit; + + of_property_for_each_u32(np, "pinmux", prop, p, pinmux_group) { + const char *group_name, *func_name; + u32 pin = FIELD_GET(K210_PG_PIN, pinmux_group); + u32 func = FIELD_GET(K210_PG_FUNC, pinmux_group); + + if (pin >= K210_NPINS) { + ret = -EINVAL; + goto exit; + } + + group_name = k210_group_names[pin]; + func_name = k210_pcf_infos[func].name; + + dev_dbg(pctldev->dev, "Pinmux %s: pin %u func %s\n", + np->name, pin, func_name); + + ret = pinctrl_utils_add_map_mux(pctldev, map, reserved_maps, + num_maps, group_name, + func_name); + if (ret < 0) { + dev_err(pctldev->dev, "%pOF add mux map failed %d\n", + np, ret); + goto exit; + } + + if (num_configs) { + ret = pinctrl_utils_add_map_configs(pctldev, map, + reserved_maps, num_maps, group_name, + configs, num_configs, + PIN_MAP_TYPE_CONFIGS_PIN); + if (ret < 0) { + dev_err(pctldev->dev, + "%pOF add configs map failed %d\n", + np, ret); + goto exit; + } + } + } + + ret = 0; + +exit: + kfree(configs); + return ret; +} + +static int k210_pinctrl_dt_node_to_map(struct pinctrl_dev *pctldev, + struct device_node *np_config, + struct pinctrl_map **map, + unsigned int *num_maps) +{ + unsigned int reserved_maps; + struct device_node *np; + int ret; + + reserved_maps = 0; + *map = NULL; + *num_maps = 0; + + ret = k210_pinctrl_dt_subnode_to_map(pctldev, np_config, map, + &reserved_maps, num_maps); + if (ret < 0) + goto err; + + for_each_available_child_of_node(np_config, np) { + ret = k210_pinctrl_dt_subnode_to_map(pctldev, np, map, + &reserved_maps, num_maps); + if (ret < 0) + goto err; + } + return 0; + +err: + pinctrl_utils_free_map(pctldev, *map, *num_maps); + return ret; +} + + +static const struct pinctrl_ops k210_pinctrl_ops = { + .get_groups_count = k210_pinctrl_get_groups_count, + .get_group_name = k210_pinctrl_get_group_name, + .get_group_pins = k210_pinctrl_get_group_pins, + .pin_dbg_show = k210_pinctrl_pin_dbg_show, + .dt_node_to_map = k210_pinctrl_dt_node_to_map, + .dt_free_map = pinconf_generic_dt_free_map, +}; + +static struct pinctrl_desc k210_pinctrl_desc = { + .name = "k210-pinctrl", + .pins = k210_pins, + .npins = K210_NPINS, + .pctlops = &k210_pinctrl_ops, + .pmxops = &k210_pinmux_ops, + .confops = &k210_pinconf_ops, + .custom_params = k210_pinconf_custom_params, + .num_custom_params = ARRAY_SIZE(k210_pinconf_custom_params), +}; + +static void k210_fpioa_init_ties(struct k210_fpioa_data *pdata) +{ + struct k210_fpioa __iomem *fpioa = pdata->fpioa; + u32 val; + int i, j; + + dev_dbg(pdata->dev, "Init pin ties\n"); + + /* Init pin functions input ties */ + for (i = 0; i < ARRAY_SIZE(fpioa->tie_en); i++) { + val = 0; + for (j = 0; j < 32; j++) { + if (k210_pcf_infos[i * 32 + j].mode_id == + K210_PC_DEFAULT_IN_TIE) { + dev_dbg(pdata->dev, + "tie_en function %d (%s)\n", + i * 32 + j, + k210_pcf_infos[i * 32 + j].name); + val |= BIT(j); + } + } + + /* Set value before enable */ + writel(val, &fpioa->tie_val[i]); + writel(val, &fpioa->tie_en[i]); + } +} + +static int k210_fpioa_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct k210_fpioa_data *pdata; + int ret; + + dev_info(dev, "K210 FPIOA pin controller\n"); + + pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + + pdata->dev = dev; + platform_set_drvdata(pdev, pdata); + + pdata->fpioa = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(pdata->fpioa)) + return PTR_ERR(pdata->fpioa); + + pdata->clk = devm_clk_get(dev, "ref"); + if (IS_ERR(pdata->clk)) + return PTR_ERR(pdata->clk); + + ret = clk_prepare_enable(pdata->clk); + if (ret) + return ret; + + pdata->pclk = devm_clk_get_optional(dev, "pclk"); + if (!IS_ERR(pdata->pclk)) + clk_prepare_enable(pdata->pclk); + + pdata->sysctl_map = + syscon_regmap_lookup_by_phandle_args(np, + "canaan,k210-sysctl-power", + 1, &pdata->power_offset); + if (IS_ERR(pdata->sysctl_map)) + return PTR_ERR(pdata->sysctl_map); + + k210_fpioa_init_ties(pdata); + + pdata->pctl = pinctrl_register(&k210_pinctrl_desc, dev, (void *)pdata); + if (IS_ERR(pdata->pctl)) + return PTR_ERR(pdata->pctl); + + return 0; +} + +static const struct of_device_id k210_fpioa_dt_ids[] = { + { .compatible = "canaan,k210-fpioa" }, + { /* sentinel */ }, +}; + +static struct platform_driver k210_fpioa_driver = { + .probe = k210_fpioa_probe, + .driver = { + .name = "k210-fpioa", + .of_match_table = k210_fpioa_dt_ids, + }, +}; +builtin_platform_driver(k210_fpioa_driver); From 5dd671333171d1ba44c16e1404f72788412e36f4 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 22 Jan 2021 19:34:29 -0800 Subject: [PATCH 170/506] RISC-V: probes: Treat the instruction stream as host-endian Neither of these are actually correct: the instruction stream is defined (for versions of the ISA manual newer than 2.2) as a stream of 16-bit little-endian parcels, which is different than just being little-endian. In theory we should represent this as a type, but we don't have any concrete plans for the big endian stuff so it doesn't seem worth the time -- we've got variants of this all over the place. Instead I'm just dropping the unnecessary type conversion, which is a NOP on LE systems but causes an sparse error as the types are all mixed up. Reported-by: kernel test robot Signed-off-by: Palmer Dabbelt Acked-by: Guo Ren Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/decode-insn.c | 2 +- arch/riscv/kernel/probes/kprobes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/probes/decode-insn.c b/arch/riscv/kernel/probes/decode-insn.c index 0876c304ca77..0ed043acc882 100644 --- a/arch/riscv/kernel/probes/decode-insn.c +++ b/arch/riscv/kernel/probes/decode-insn.c @@ -16,7 +16,7 @@ enum probe_insn __kprobes riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *api) { - probe_opcode_t insn = le32_to_cpu(*addr); + probe_opcode_t insn = *addr; /* * Reject instructions list: diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index e60893bd87db..a2ec18662fee 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -57,7 +57,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) } /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; /* decode instruction */ switch (riscv_probe_decode_insn(p->addr, &p->ainsn.api)) { From 3449831d92fea50b470d5b22435cfeaf15a6dd54 Mon Sep 17 00:00:00 2001 From: Chengyang Fan Date: Mon, 25 Jan 2021 19:23:47 +0800 Subject: [PATCH 171/506] RISC-V: remove unneeded semicolon Remove a superfluous semicolon after function definition. Signed-off-by: Chengyang Fan Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/set_memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 211eb8244a45..9fa510707012 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -22,7 +22,7 @@ static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } -static inline void protect_kernel_text_data(void) {}; +static inline void protect_kernel_text_data(void) {} static inline int set_memory_rw_nx(unsigned long addr, int numpages) { return 0; } #endif From 4cd48bb3b07730214d4e56abd6030c5159eb2572 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Jan 2021 19:55:33 -0800 Subject: [PATCH 172/506] arch_numa: fix common code printing of phys_addr_t Fix build warnings in the arch_numa common code: ../include/linux/kern_levels.h:5:18: warning: format '%Lx' expects argument of type 'long long unsigned int', but argument 3 has type 'phys_addr_t' {aka 'unsigned int'} [-Wformat=] ../drivers/base/arch_numa.c:360:56: note: format string is defined here 360 | pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", ../drivers/base/arch_numa.c:435:39: note: format string is defined here 435 | pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1); Fixes: ae3c107cd8be ("numa: Move numa implementation to common code") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- drivers/base/arch_numa.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 0dae54ce7d43..4cc4e117727d 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -355,11 +355,12 @@ static int __init numa_register_nodes(void) /* Check that valid nid is set to memblks */ for_each_mem_region(mblk) { int mblk_nid = memblock_get_region_node(mblk); + phys_addr_t start = mblk->base; + phys_addr_t end = mblk->base + mblk->size - 1; if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) { - pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", - mblk_nid, mblk->base, - mblk->base + mblk->size - 1); + pr_warn("Warning: invalid memblk node %d [mem %pap-%pap]\n", + mblk_nid, &start, &end); return -EINVAL; } } @@ -427,14 +428,14 @@ static int __init numa_init(int (*init_func)(void)) static int __init dummy_numa_init(void) { phys_addr_t start = memblock_start_of_DRAM(); - phys_addr_t end = memblock_end_of_DRAM(); + phys_addr_t end = memblock_end_of_DRAM() - 1; int ret; if (numa_off) pr_info("NUMA disabled\n"); /* Forced off on command line. */ - pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1); + pr_info("Faking a node at [mem %pap-%pap]\n", &start, &end); - ret = numa_add_memblk(0, start, end); + ret = numa_add_memblk(0, start, end + 1); if (ret) { pr_err("NUMA init failed\n"); return ret; From 65d4b9c5301749d18b5ec1323fdefecefab72687 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 3 Feb 2021 15:19:07 +0530 Subject: [PATCH 173/506] RISC-V: Implement ASID allocator Currently, we do local TLB flush on every MM switch. This is very harsh on performance because we are forcing page table walks after every MM switch. This patch implements ASID allocator for assigning an ASID to a MM context. The number of ASIDs are limited in HW so we create a logical entity named CONTEXTID for assigning to MM context. The lower bits of CONTEXTID are ASID and upper bits are VERSION number. The number of usable ASID bits supported by HW are detected at boot-time by writing 1s to ASID bits in SATP CSR. We allocate new CONTEXTID on first MM switch for a MM context where the ASID is allocated from an ASID bitmap and VERSION is provide by an atomic counter. At time of allocating new CONTEXTID, if we run out of available ASIDs then: 1. We flush the ASID bitmap 2. Increment current VERSION atomic counter 3. Re-allocate ASID from ASID bitmap 4. Flush TLB on all CPUs 5. Try CONTEXTID re-assignment on all CPUs Please note that we don't use ASID #0 because it is used at boot-time by all CPUs for initial MM context. Also, newly created context is always assigned CONTEXTID #0 (i.e. VERSION #0 and ASID #0) which is an invalid context in our implementation. Using above approach, we have virtually infinite CONTEXTIDs on-top-of limited number of HW ASIDs. This approach is inspired from ASID allocator used for Linux ARM/ARM64 but we have adapted it for RISC-V. Overall, this ASID allocator helps us reduce rate of local TLB flushes on every CPU thereby increasing performance. This patch is tested on QEMU virt machine, Spike and SiFive Unleashed board. On QEMU virt machine, we see some (3-5% approx) performance improvement with SW emulated TLBs provided by QEMU. Unfortunately, the ASID bits of the SATP CSR are not implemented on Spike and SiFive Unleashed board so we don't see any change in performance. On real HW having all ASID bits implemented, the performance gains will be much more due improved sharing of TLB among different processes. Signed-off-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/csr.h | 6 + arch/riscv/include/asm/mmu.h | 2 + arch/riscv/include/asm/mmu_context.h | 10 + arch/riscv/mm/context.c | 265 ++++++++++++++++++++++++++- 4 files changed, 279 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index cec462e198ce..caadfc1d7487 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -41,10 +41,16 @@ #define SATP_PPN _AC(0x003FFFFF, UL) #define SATP_MODE_32 _AC(0x80000000, UL) #define SATP_MODE SATP_MODE_32 +#define SATP_ASID_BITS 9 +#define SATP_ASID_SHIFT 22 +#define SATP_ASID_MASK _AC(0x1FF, UL) #else #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) #define SATP_MODE_39 _AC(0x8000000000000000, UL) #define SATP_MODE SATP_MODE_39 +#define SATP_ASID_BITS 16 +#define SATP_ASID_SHIFT 44 +#define SATP_ASID_MASK _AC(0xFFFF, UL) #endif /* Exception cause high bit - is an interrupt if set */ diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index dabcf2cfb3dc..0099dc116168 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -12,6 +12,8 @@ typedef struct { #ifndef CONFIG_MMU unsigned long end_brk; +#else + atomic_long_t id; #endif void *vdso; #ifdef CONFIG_SMP diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h index 250defa06f3a..b0659413a080 100644 --- a/arch/riscv/include/asm/mmu_context.h +++ b/arch/riscv/include/asm/mmu_context.h @@ -23,6 +23,16 @@ static inline void activate_mm(struct mm_struct *prev, switch_mm(prev, next, NULL); } +#define init_new_context init_new_context +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ +#ifdef CONFIG_MMU + atomic_long_set(&mm->context.id, 0); +#endif + return 0; +} + #include #endif /* _ASM_RISCV_MMU_CONTEXT_H */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 613ec81a8979..68aa312fc352 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -2,13 +2,273 @@ /* * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive + * Copyright (C) 2021 Western Digital Corporation or its affiliates. */ +#include +#include #include +#include +#include +#include +#include #include #include #include +#ifdef CONFIG_MMU + +static DEFINE_STATIC_KEY_FALSE(use_asid_allocator); + +static unsigned long asid_bits; +static unsigned long num_asids; +static unsigned long asid_mask; + +static atomic_long_t current_version; + +static DEFINE_RAW_SPINLOCK(context_lock); +static cpumask_t context_tlb_flush_pending; +static unsigned long *context_asid_map; + +static DEFINE_PER_CPU(atomic_long_t, active_context); +static DEFINE_PER_CPU(unsigned long, reserved_context); + +static bool check_update_reserved_context(unsigned long cntx, + unsigned long newcntx) +{ + int cpu; + bool hit = false; + + /* + * Iterate over the set of reserved CONTEXT looking for a match. + * If we find one, then we can update our mm to use new CONTEXT + * (i.e. the same CONTEXT in the current_version) but we can't + * exit the loop early, since we need to ensure that all copies + * of the old CONTEXT are updated to reflect the mm. Failure to do + * so could result in us missing the reserved CONTEXT in a future + * version. + */ + for_each_possible_cpu(cpu) { + if (per_cpu(reserved_context, cpu) == cntx) { + hit = true; + per_cpu(reserved_context, cpu) = newcntx; + } + } + + return hit; +} + +static void __flush_context(void) +{ + int i; + unsigned long cntx; + + /* Must be called with context_lock held */ + lockdep_assert_held(&context_lock); + + /* Update the list of reserved ASIDs and the ASID bitmap. */ + bitmap_clear(context_asid_map, 0, num_asids); + + /* Mark already active ASIDs as used */ + for_each_possible_cpu(i) { + cntx = atomic_long_xchg_relaxed(&per_cpu(active_context, i), 0); + /* + * If this CPU has already been through a rollover, but + * hasn't run another task in the meantime, we must preserve + * its reserved CONTEXT, as this is the only trace we have of + * the process it is still running. + */ + if (cntx == 0) + cntx = per_cpu(reserved_context, i); + + __set_bit(cntx & asid_mask, context_asid_map); + per_cpu(reserved_context, i) = cntx; + } + + /* Mark ASID #0 as used because it is used at boot-time */ + __set_bit(0, context_asid_map); + + /* Queue a TLB invalidation for each CPU on next context-switch */ + cpumask_setall(&context_tlb_flush_pending); +} + +static unsigned long __new_context(struct mm_struct *mm) +{ + static u32 cur_idx = 1; + unsigned long cntx = atomic_long_read(&mm->context.id); + unsigned long asid, ver = atomic_long_read(¤t_version); + + /* Must be called with context_lock held */ + lockdep_assert_held(&context_lock); + + if (cntx != 0) { + unsigned long newcntx = ver | (cntx & asid_mask); + + /* + * If our current CONTEXT was active during a rollover, we + * can continue to use it and this was just a false alarm. + */ + if (check_update_reserved_context(cntx, newcntx)) + return newcntx; + + /* + * We had a valid CONTEXT in a previous life, so try to + * re-use it if possible. + */ + if (!__test_and_set_bit(cntx & asid_mask, context_asid_map)) + return newcntx; + } + + /* + * Allocate a free ASID. If we can't find one then increment + * current_version and flush all ASIDs. + */ + asid = find_next_zero_bit(context_asid_map, num_asids, cur_idx); + if (asid != num_asids) + goto set_asid; + + /* We're out of ASIDs, so increment current_version */ + ver = atomic_long_add_return_relaxed(num_asids, ¤t_version); + + /* Flush everything */ + __flush_context(); + + /* We have more ASIDs than CPUs, so this will always succeed */ + asid = find_next_zero_bit(context_asid_map, num_asids, 1); + +set_asid: + __set_bit(asid, context_asid_map); + cur_idx = asid; + return asid | ver; +} + +static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) +{ + unsigned long flags; + bool need_flush_tlb = false; + unsigned long cntx, old_active_cntx; + + cntx = atomic_long_read(&mm->context.id); + + /* + * If our active_context is non-zero and the context matches the + * current_version, then we update the active_context entry with a + * relaxed cmpxchg. + * + * Following is how we handle racing with a concurrent rollover: + * + * - We get a zero back from the cmpxchg and end up waiting on the + * lock. Taking the lock synchronises with the rollover and so + * we are forced to see the updated verion. + * + * - We get a valid context back from the cmpxchg then we continue + * using old ASID because __flush_context() would have marked ASID + * of active_context as used and next context switch we will + * allocate new context. + */ + old_active_cntx = atomic_long_read(&per_cpu(active_context, cpu)); + if (old_active_cntx && + ((cntx & ~asid_mask) == atomic_long_read(¤t_version)) && + atomic_long_cmpxchg_relaxed(&per_cpu(active_context, cpu), + old_active_cntx, cntx)) + goto switch_mm_fast; + + raw_spin_lock_irqsave(&context_lock, flags); + + /* Check that our ASID belongs to the current_version. */ + cntx = atomic_long_read(&mm->context.id); + if ((cntx & ~asid_mask) != atomic_long_read(¤t_version)) { + cntx = __new_context(mm); + atomic_long_set(&mm->context.id, cntx); + } + + if (cpumask_test_and_clear_cpu(cpu, &context_tlb_flush_pending)) + need_flush_tlb = true; + + atomic_long_set(&per_cpu(active_context, cpu), cntx); + + raw_spin_unlock_irqrestore(&context_lock, flags); + +switch_mm_fast: + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | + ((cntx & asid_mask) << SATP_ASID_SHIFT) | + SATP_MODE); + + if (need_flush_tlb) + local_flush_tlb_all(); +} + +static void set_mm_noasid(struct mm_struct *mm) +{ + /* Switch the page table and blindly nuke entire local TLB */ + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); + local_flush_tlb_all(); +} + +static inline void set_mm(struct mm_struct *mm, unsigned int cpu) +{ + if (static_branch_unlikely(&use_asid_allocator)) + set_mm_asid(mm, cpu); + else + set_mm_noasid(mm); +} + +static int asids_init(void) +{ + unsigned long old; + + /* Figure-out number of ASID bits in HW */ + old = csr_read(CSR_SATP); + asid_bits = old | (SATP_ASID_MASK << SATP_ASID_SHIFT); + csr_write(CSR_SATP, asid_bits); + asid_bits = (csr_read(CSR_SATP) >> SATP_ASID_SHIFT) & SATP_ASID_MASK; + asid_bits = fls_long(asid_bits); + csr_write(CSR_SATP, old); + + /* + * In the process of determining number of ASID bits (above) + * we polluted the TLB of current HART so let's do TLB flushed + * to remove unwanted TLB enteries. + */ + local_flush_tlb_all(); + + /* Pre-compute ASID details */ + num_asids = 1 << asid_bits; + asid_mask = num_asids - 1; + + /* + * Use ASID allocator only if number of HW ASIDs are + * at-least twice more than CPUs + */ + if (num_asids > (2 * num_possible_cpus())) { + atomic_long_set(¤t_version, num_asids); + + context_asid_map = kcalloc(BITS_TO_LONGS(num_asids), + sizeof(*context_asid_map), GFP_KERNEL); + if (!context_asid_map) + panic("Failed to allocate bitmap for %lu ASIDs\n", + num_asids); + + __set_bit(0, context_asid_map); + + static_branch_enable(&use_asid_allocator); + + pr_info("ASID allocator using %lu bits (%lu entries)\n", + asid_bits, num_asids); + } else { + pr_info("ASID allocator disabled\n"); + } + + return 0; +} +early_initcall(asids_init); +#else +static inline void set_mm(struct mm_struct *mm, unsigned int cpu) +{ + /* Nothing to do here when there is no MMU */ +} +#endif + /* * When necessary, performs a deferred icache flush for the given MM context, * on the local CPU. RISC-V has no direct mechanism for instruction cache @@ -58,10 +318,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, cpumask_clear_cpu(cpu, mm_cpumask(prev)); cpumask_set_cpu(cpu, mm_cpumask(next)); -#ifdef CONFIG_MMU - csr_write(CSR_SATP, virt_to_pfn(next->pgd) | SATP_MODE); - local_flush_tlb_all(); -#endif + set_mm(next, cpu); flush_icache_deferred(next); } From c060c72ffeb448fbb5864faa1f672ebfe14dd25f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:02 -0800 Subject: [PATCH 174/506] KVM: x86/mmu: Expand collapsible SPTE zap for TDP MMU to ZONE_DEVICE and HugeTLB pages Zap SPTEs that are backed by ZONE_DEVICE pages when zappings SPTEs to rebuild them as huge pages in the TDP MMU. ZONE_DEVICE huge pages are managed differently than "regular" pages and are not compound pages. Likewise, PageTransCompoundMap() will not detect HugeTLB, so switch to PageCompound(). This matches the similar check in kvm_mmu_zap_collapsible_spte. Cc: Ben Gardon Fixes: 14881998566d ("kvm: x86/mmu: Support disabling dirty logging for the tdp MMU") Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 71e100a5670f..6e0741176a2a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1348,7 +1348,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm, pfn = spte_to_pfn(iter.old_spte); if (kvm_is_reserved_pfn(pfn) || - !PageTransCompoundMap(pfn_to_page(pfn))) + (!PageCompound(pfn_to_page(pfn)) && + !kvm_is_zone_device_pfn(pfn))) continue; tdp_mmu_set_spte(kvm, &iter, 0); From 1b6d9d9ed5717157933db77d96bb12884c17ce52 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:04 -0800 Subject: [PATCH 175/506] KVM: x86/mmu: Split out max mapping level calculation to helper Factor out the logic for determining the maximum mapping level given a memslot and a gpa. The helper will be used when zapping collapsible SPTEs when disabling dirty logging, e.g. to avoid zapping SPTEs that can't possibly be rebuilt as hugepages. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 37 ++++++++++++++++++++------------- arch/x86/kvm/mmu/mmu_internal.h | 2 ++ 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e507568cd55d..e97ed942178e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2756,8 +2756,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - kvm_pfn_t pfn, struct kvm_memory_slot *slot) +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + struct kvm_memory_slot *slot) { unsigned long hva; pte_t *pte; @@ -2776,19 +2776,36 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, */ hva = __gfn_to_hva_memslot(slot, gfn); - pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); + pte = lookup_address_in_mm(kvm->mm, hva, &level); if (unlikely(!pte)) return PG_LEVEL_4K; return level; } +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t pfn, int max_level) +{ + struct kvm_lpage_info *linfo; + + max_level = min(max_level, max_huge_page_level); + for ( ; max_level > PG_LEVEL_4K; max_level--) { + linfo = lpage_info_slot(gfn, slot, max_level); + if (!linfo->disallow_lpage) + break; + } + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + return host_pfn_mapping_level(kvm, gfn, pfn, slot); +} + int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; @@ -2805,17 +2822,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_huge_page_level); - for ( ; max_level > PG_LEVEL_4K; max_level--) { - linfo = lpage_info_slot(gfn, slot, max_level); - if (!linfo->disallow_lpage) - break; - } - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); + level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); if (level == PG_LEVEL_4K) return level; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 9e38d3c5daad..0b55aa561ec8 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -138,6 +138,8 @@ enum { #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) #define SET_SPTE_SPURIOUS BIT(2) +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t pfn, int max_level); int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level); From 0a234f5dd06582e82edec7cf17a0f971c5a4142e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:05 -0800 Subject: [PATCH 176/506] KVM: x86/mmu: Pass the memslot to the rmap callbacks Pass the memslot to the rmap callbacks, it will be used when zapping collapsible SPTEs to verify the memslot is compatible with hugepages before zapping its SPTEs. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e97ed942178e..e6f06eac11cb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1165,7 +1165,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared. */ -static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1196,7 +1197,8 @@ static bool spte_set_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1260,7 +1262,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); - __rmap_clear_dirty(kvm, rmap_head); + __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ mask &= mask - 1; @@ -1325,7 +1327,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1345,7 +1348,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { - return kvm_zap_rmapp(kvm, rmap_head); + return kvm_zap_rmapp(kvm, rmap_head, slot); } static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -5189,7 +5192,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool @@ -5203,7 +5207,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); + flush |= fn(kvm, iterator.rmap, memslot); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && lock_flush_tlb) { @@ -5492,7 +5496,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } static bool slot_rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } @@ -5526,7 +5531,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; From 9eba50f8d7fcb61774f160890f98239fa3ab68a6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:06 -0800 Subject: [PATCH 177/506] KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs When zapping SPTEs in order to rebuild them as huge pages, use the new helper that computes the max mapping level to detect whether or not a SPTE should be zapped. Doing so avoids zapping SPTEs that can't possibly be rebuilt as huge pages, e.g. due to hardware constraints, memslot alignment, etc... This also avoids zapping SPTEs that are still large, e.g. if migration was canceled before write-protected huge pages were shattered to enable dirty logging. Note, such pages are still write-protected at this time, i.e. a page fault VM-Exit will still occur. This will hopefully be addressed in a future patch. Sadly, TDP MMU loses its const on the memslot, but that's a pervasive problem that's been around for quite some time. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++++----- arch/x86/kvm/mmu/tdp_mmu.c | 13 +++++++------ arch/x86/kvm/mmu/tdp_mmu.h | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e6f06eac11cb..4f2dfd59f9a2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5553,8 +5553,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - (kvm_is_zone_device_pfn(pfn) || - PageCompound(pfn_to_page(pfn)))) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, + pfn, PG_LEVEL_NUM)) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -5574,12 +5574,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ + struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; + write_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, - kvm_mmu_zap_collapsible_spte, true); + slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6e0741176a2a..f8fa1f64e10d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1328,8 +1328,10 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) + struct kvm_memory_slot *slot) { + gfn_t start = slot->base_gfn; + gfn_t end = start + slot->npages; struct tdp_iter iter; kvm_pfn_t pfn; bool spte_set = false; @@ -1348,8 +1350,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm, pfn = spte_to_pfn(iter.old_spte); if (kvm_is_reserved_pfn(pfn) || - (!PageCompound(pfn_to_page(pfn)) && - !kvm_is_zone_device_pfn(pfn))) + iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, + pfn, PG_LEVEL_NUM)) continue; tdp_mmu_set_spte(kvm, &iter, 0); @@ -1367,7 +1369,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, * be replaced by large mappings, for GFNs within the slot. */ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) + struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; int root_as_id; @@ -1377,8 +1379,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, if (root_as_id != slot->as_id) continue; - zap_collapsible_spte_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); + zap_collapsible_spte_range(kvm, root, slot); } } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index b4b65e3699b3..d31c5ed81a18 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -35,7 +35,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, bool wrprot); bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); + struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); From c3bb9a20834ffe72d3031afe460ff03d3b3b6e90 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:07 -0800 Subject: [PATCH 178/506] KVM: nVMX: Disable PML in hardware when running L2 Unconditionally disable PML in vmcs02, KVM emulates PML purely in the MMU, e.g. vmx_flush_pml_buffer() doesn't even try to copy the L2 GPAs from vmcs02's buffer to vmcs12. At best, enabling PML is a nop. At worst, it will cause vmx_flush_pml_buffer() to record bogus GFNs in the dirty logs. Initialize vmcs02.GUEST_PML_INDEX such that PML writes would trigger VM-Exit if PML was somehow enabled, skip flushing the buffer for guest mode since the index is bogus, and freak out if a PML full exit occurs when L2 is active. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++-------------- arch/x86/kvm/vmx/vmx.c | 12 ++++++++++-- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 066156ff9c7c..486c7ac6d813 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2167,15 +2167,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); /* - * The PML address never changes, so it is constant in vmcs02. - * Conceptually we want to copy the PML index from vmcs01 here, - * and then back to vmcs01 on nested vmexit. But since we flush - * the log and reset GUEST_PML_INDEX on each vmexit, the PML - * index is also effectively constant in vmcs02. + * PML is emulated for L2, but never enabled in hardware as the MMU + * handles A/D emulation. Disabling PML for L2 also avoids having to + * deal with filtering out L2 GPAs from the buffer. */ if (enable_pml) { - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write64(PML_ADDRESS, 0); + vmcs_write16(GUEST_PML_INDEX, -1); } if (cpu_has_vmx_encls_vmexit()) @@ -2210,7 +2208,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - u32 exec_control, vmcs12_exec_ctrl; + u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) @@ -2284,11 +2282,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { - vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & - ~SECONDARY_EXEC_ENABLE_PML; - exec_control |= vmcs12_exec_ctrl; - } + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + exec_control |= vmcs12->secondary_vm_exec_control; + + /* PML is emulated and never enabled in hardware for L2. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; @@ -5790,7 +5788,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_PREEMPTION_TIMER: return true; case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ + /* + * PML is emulated for an L1 VMM and should never be enabled in + * vmcs02, always "handle" PML_FULL by exiting to userspace. + */ return true; case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5ddaf7e4f601..559a0f16263e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5966,9 +5966,10 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before * querying dirty_bitmap, we only need to kick all vcpus out of guest * mode as if vcpus is in root mode, the PML buffer must has been - * flushed already. + * flushed already. Note, PML is never enabled in hardware while + * running L2. */ - if (enable_pml) + if (enable_pml && !is_guest_mode(vcpu)) vmx_flush_pml_buffer(vcpu); /* @@ -5984,6 +5985,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return handle_invalid_guest_state(vcpu); if (is_guest_mode(vcpu)) { + /* + * PML is never enabled when running L2, bail immediately if a + * PML full exit occurs as something is horribly wrong. + */ + if (exit_reason.basic == EXIT_REASON_PML_FULL) + goto unexpected_vmexit; + /* * The host physical addresses of some pages of guest memory * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC From 2855f98265dc579bd2becb79ce0156d08e0df813 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:08 -0800 Subject: [PATCH 179/506] KVM: x86/mmu: Expand on the comment in kvm_vcpu_ad_need_write_protect() Expand the comment about need to use write-protection for nested EPT when PML is enabled to clarify that the tagging is a nop when PML is _not_ enabled. Without the clarification, omitting the PML check looks wrong at first^Wfifth glance. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu_internal.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 0b55aa561ec8..72b0928f2b2d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -84,7 +84,10 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) * When using the EPT page-modification log, the GPAs in the log * would come from L2 rather than L1. Therefore, we need to rely * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. + * PML, since writes now result in a vmexit. Note, this helper will + * tag SPTEs as needing write-protection even if PML is disabled or + * unsupported, but that's ok because the tag is consumed if and only + * if PML is enabled. Omit the PML check to save a few uops. */ return vcpu->arch.mmu == &vcpu->arch.guest_mmu; } From 6dd03800b1afe4d3b6f26b0d20f2e1ecebf32b29 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:09 -0800 Subject: [PATCH 180/506] KVM: x86/mmu: Make dirty log size hook (PML) a value, not a function Store the vendor-specific dirty log size in a variable, there's no need to wrap it in a function since the value is constant after hardware_setup() runs. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 - arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 5 +---- arch/x86/kvm/vmx/vmx.c | 9 ++------- 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 355a2ab8fc09..28c07cc01474 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -97,7 +97,6 @@ KVM_X86_OP_NULL(slot_enable_log_dirty) KVM_X86_OP_NULL(slot_disable_log_dirty) KVM_X86_OP_NULL(flush_log_dirty) KVM_X86_OP_NULL(enable_log_dirty_pt_masked) -KVM_X86_OP_NULL(cpu_dirty_log_size) KVM_X86_OP_NULL(pre_block) KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 84499aad01a4..fb59933610d9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1294,7 +1294,7 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); - int (*cpu_dirty_log_size)(void); + int cpu_dirty_log_size; /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4f2dfd59f9a2..0ed52397c274 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1294,10 +1294,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, int kvm_cpu_dirty_log_size(void) { - if (kvm_x86_ops.cpu_dirty_log_size) - return static_call(kvm_x86_cpu_dirty_log_size)(); - - return 0; + return kvm_x86_ops.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 559a0f16263e..02456ab8fd0c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7642,11 +7642,6 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } -static int vmx_cpu_dirty_log_size(void) -{ - return enable_pml ? PML_ENTITY_NUM : 0; -} - static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_unsetup = hardware_unsetup, @@ -7750,6 +7745,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .cpu_dirty_log_size = PML_ENTITY_NUM, .pre_block = vmx_pre_block, .post_block = vmx_post_block, @@ -7777,7 +7773,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .msr_filter_changed = vmx_msr_filter_changed, .complete_emulated_msr = kvm_complete_insn_gp, - .cpu_dirty_log_size = vmx_cpu_dirty_log_size, .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; @@ -7899,7 +7894,7 @@ static __init int hardware_setup(void) vmx_x86_ops.slot_disable_log_dirty = NULL; vmx_x86_ops.flush_log_dirty = NULL; vmx_x86_ops.enable_log_dirty_pt_masked = NULL; - vmx_x86_ops.cpu_dirty_log_size = NULL; + vmx_x86_ops.cpu_dirty_log_size = 0; } if (!cpu_has_vmx_preemption_timer()) From a018eba53870aa30e5e57465771cb209680f20c2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:10 -0800 Subject: [PATCH 181/506] KVM: x86: Move MMU's PML logic to common code Drop the facade of KVM's PML logic being vendor specific and move the bits that aren't truly VMX specific into common x86 code. The MMU logic for dealing with PML is tightly coupled to the feature and to VMX's implementation, bouncing through kvm_x86_ops obfuscates the code without providing any meaningful separation of concerns or encapsulation. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 4 --- arch/x86/include/asm/kvm_host.h | 27 ++------------- arch/x86/kvm/mmu/mmu.c | 16 +++------ arch/x86/kvm/vmx/vmx.c | 55 +----------------------------- arch/x86/kvm/x86.c | 22 ++++++++---- 5 files changed, 24 insertions(+), 100 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 28c07cc01474..90affdb2cbbc 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -93,10 +93,6 @@ KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) KVM_X86_OP_NULL(request_immediate_exit) KVM_X86_OP(sched_in) -KVM_X86_OP_NULL(slot_enable_log_dirty) -KVM_X86_OP_NULL(slot_disable_log_dirty) -KVM_X86_OP_NULL(flush_log_dirty) -KVM_X86_OP_NULL(enable_log_dirty_pt_masked) KVM_X86_OP_NULL(pre_block) KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fb59933610d9..5cf382ec48b0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1271,29 +1271,9 @@ struct kvm_x86_ops { void (*sched_in)(struct kvm_vcpu *kvm, int cpu); /* - * Arch-specific dirty logging hooks. These hooks are only supposed to - * be valid if the specific arch has hardware-accelerated dirty logging - * mechanism. Currently only for PML on VMX. - * - * - slot_enable_log_dirty: - * called when enabling log dirty mode for the slot. - * - slot_disable_log_dirty: - * called when disabling log dirty mode for the slot. - * also called when slot is created with log dirty disabled. - * - flush_log_dirty: - * called before reporting dirty_bitmap to userspace. - * - enable_log_dirty_pt_masked: - * called when reenabling log dirty for the GFNs in the mask after - * corresponding bits are cleared in slot->dirty_bitmap. + * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero + * value indicates CPU dirty logging is unsupported or disabled. */ - void (*slot_enable_log_dirty)(struct kvm *kvm, - struct kvm_memory_slot *slot); - void (*slot_disable_log_dirty)(struct kvm *kvm, - struct kvm_memory_slot *slot); - void (*flush_log_dirty)(struct kvm *kvm); - void (*enable_log_dirty_pt_masked)(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t offset, unsigned long mask); int cpu_dirty_log_size; /* pmu operations of sub-arch */ @@ -1439,9 +1419,6 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); -void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0ed52397c274..f208697781fc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1250,9 +1250,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, * * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. */ -void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) +static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; @@ -1268,7 +1268,6 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, mask &= mask - 1; } } -EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); /** * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected @@ -1284,10 +1283,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - if (kvm_x86_ops.enable_log_dirty_pt_masked) - static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot, - gfn_offset, - mask); + if (kvm_x86_ops.cpu_dirty_log_size) + kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } @@ -5616,7 +5613,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -5633,7 +5629,6 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -5649,7 +5644,6 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); void kvm_mmu_zap_all(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 02456ab8fd0c..eb207ffb7873 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5766,24 +5766,6 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } -/* - * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. - * Called before reporting dirty_bitmap to userspace. - */ -static void kvm_flush_pml_buffers(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - /* - * We only need to kick vcpu out of guest mode here, as PML buffer - * is flushed at beginning of all VMEXITs, and it's obvious that only - * vcpus running in guest are possible to have unflushed GPAs in PML - * buffer. - */ - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_kick(vcpu); -} - static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", @@ -7509,32 +7491,6 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } -static void vmx_slot_enable_log_dirty(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); - kvm_mmu_slot_largepage_remove_write_access(kvm, slot); -} - -static void vmx_slot_disable_log_dirty(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - kvm_mmu_slot_set_dirty(kvm, slot); -} - -static void vmx_flush_log_dirty(struct kvm *kvm) -{ - kvm_flush_pml_buffers(kvm); -} - -static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *memslot, - gfn_t offset, unsigned long mask) -{ - kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); -} - static int vmx_pre_block(struct kvm_vcpu *vcpu) { if (pi_pre_block(vcpu)) @@ -7741,10 +7697,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .sched_in = vmx_sched_in, - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, - .slot_disable_log_dirty = vmx_slot_disable_log_dirty, - .flush_log_dirty = vmx_flush_log_dirty, - .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, .cpu_dirty_log_size = PML_ENTITY_NUM, .pre_block = vmx_pre_block, @@ -7889,13 +7841,8 @@ static __init int hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) { - vmx_x86_ops.slot_enable_log_dirty = NULL; - vmx_x86_ops.slot_disable_log_dirty = NULL; - vmx_x86_ops.flush_log_dirty = NULL; - vmx_x86_ops.enable_log_dirty_pt_masked = NULL; + if (!enable_pml) vmx_x86_ops.cpu_dirty_log_size = 0; - } if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3fa140383f5d..e89fe98a0099 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5214,10 +5214,18 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { + /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. + * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called + * before reporting dirty_bitmap to userspace. KVM flushes the buffers + * on all VM-Exits, thus we only need to kick running vCPUs to force a + * VM-Exit. */ - static_call_cond(kvm_x86_flush_log_dirty)(kvm); + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -10809,8 +10817,10 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * is enabled the D-bit or the W-bit will be cleared. */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops.slot_enable_log_dirty) { - static_call(kvm_x86_slot_enable_log_dirty)(kvm, new); + if (kvm_x86_ops.cpu_dirty_log_size) { + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_slot_leaf_clear_dirty(kvm, new); + kvm_mmu_slot_largepage_remove_write_access(kvm, new); } else { int level = kvm_dirty_log_manual_protect_and_init_set(kvm) ? @@ -10826,8 +10836,8 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, */ kvm_mmu_slot_remove_write_access(kvm, new, level); } - } else { - static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new); + } else if (kvm_x86_ops.cpu_dirty_log_size) { + kvm_mmu_slot_set_dirty(kvm, new); } } From 52f4607940b18337f01d160aaae346eaac388bf7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:11 -0800 Subject: [PATCH 182/506] KVM: x86: Further clarify the logic and comments for toggling log dirty Add a sanity check in kvm_mmu_slot_apply_flags to assert that the LOG_DIRTY_PAGES flag is indeed being toggled, and explicitly rely on that holding true when zapping collapsible SPTEs. Manipulating the CPU dirty log (PML) and write-protection also relies on this assertion, but that's not obvious in the current code. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e89fe98a0099..c0d22f19aed0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10761,12 +10761,20 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, enum kvm_mr_change change) { /* - * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot. - * See comments below. + * Nothing to do for RO slots (which can't be dirtied and can't be made + * writable) or CREATE/MOVE/DELETE of a slot. See comments below. */ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) return; + /* + * READONLY and non-flags changes were filtered out above, and the only + * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty + * logging isn't being toggled on or off. + */ + if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) + return; + /* * Dirty logging tracks sptes in 4k granularity, meaning that large * sptes have to be split. If live migration is successful, the guest @@ -10784,8 +10792,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot() */ - if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); /* From a85863c2ec55edcfd11853014b143fc02b8840a9 Mon Sep 17 00:00:00 2001 From: Makarand Sonare Date: Fri, 12 Feb 2021 16:50:12 -0800 Subject: [PATCH 183/506] KVM: VMX: Dynamically enable/disable PML based on memslot dirty logging Currently, if enable_pml=1 PML remains enabled for the entire lifetime of the VM irrespective of whether dirty logging is enable or disabled. When dirty logging is disabled, all the pages of the VM are manually marked dirty, so that PML is effectively non-operational. Setting the dirty bits is an expensive operation which can cause severe MMU lock contention in a performance sensitive path when dirty logging is disabled after a failed or canceled live migration. Manually setting dirty bits also fails to prevent PML activity if some code path clears dirty bits, which can incur unnecessary VM-Exits. In order to avoid this extra overhead, dynamically enable/disable PML when dirty logging gets turned on/off for the first/last memslot. Signed-off-by: Makarand Sonare Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/vmx/nested.c | 5 +++++ arch/x86/kvm/vmx/vmx.c | 28 +++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 2 ++ arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++---- 6 files changed, 70 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 90affdb2cbbc..323641097f63 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -93,6 +93,7 @@ KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) KVM_X86_OP_NULL(request_immediate_exit) KVM_X86_OP(sched_in) +KVM_X86_OP_NULL(update_cpu_dirty_logging) KVM_X86_OP_NULL(pre_block) KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5cf382ec48b0..ffcfa84c969d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -89,6 +89,8 @@ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) +#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ + KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -1007,6 +1009,7 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + int cpu_dirty_logging_count; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; @@ -1275,6 +1278,7 @@ struct kvm_x86_ops { * value indicates CPU dirty logging is unsupported or disabled. */ int cpu_dirty_log_size; + void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 486c7ac6d813..bcca0b80e0d0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4490,6 +4490,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_set_virtual_apic_mode(vcpu); } + if (vmx->nested.update_vmcs01_cpu_dirty_logging) { + vmx->nested.update_vmcs01_cpu_dirty_logging = false; + vmx_update_cpu_dirty_logging(vcpu); + } + /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_clean(vmx->nested.apic_access_page); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index eb207ffb7873..50810d471462 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4277,7 +4277,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - if (!enable_pml) + /* + * PML is enabled/disabled when dirty logging of memsmlots changes, but + * it needs to be set here when dirty logging is already active, e.g. + * if this vCPU was created after dirty logging was enabled. + */ + if (!vcpu->kvm->arch.cpu_dirty_logging_count) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; if (cpu_has_vmx_xsaves()) { @@ -7491,6 +7496,26 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu)) { + vmx->nested.update_vmcs01_cpu_dirty_logging = true; + return; + } + + /* + * Note, cpu_dirty_logging_count can be changed concurrent with this + * code, but in that case another update request will be made and so + * the guest will never run with a stale PML value. + */ + if (vcpu->kvm->arch.cpu_dirty_logging_count) + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); + else + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); +} + static int vmx_pre_block(struct kvm_vcpu *vcpu) { if (pi_pre_block(vcpu)) @@ -7698,6 +7723,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .sched_in = vmx_sched_in, .cpu_dirty_log_size = PML_ENTITY_NUM, + .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, .pre_block = vmx_pre_block, .post_block = vmx_post_block, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 12c53d05a902..89da5e1251f1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -165,6 +165,7 @@ struct nested_vmx { bool change_vmcs01_virtual_apic_mode; bool reload_vmcs01_apic_access_page; + bool update_vmcs01_cpu_dirty_logging; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to @@ -393,6 +394,7 @@ int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value); +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); static inline u8 vmx_get_rvi(void) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0d22f19aed0..b9a8c8af9713 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8987,6 +8987,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_check_async_pf_completion(vcpu); if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) static_call(kvm_x86_msr_filter_changed)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) + static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -10755,14 +10758,38 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return 0; } + +static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) +{ + struct kvm_arch *ka = &kvm->arch; + + if (!kvm_x86_ops.cpu_dirty_log_size) + return; + + if ((enable && ++ka->cpu_dirty_logging_count == 1) || + (!enable && --ka->cpu_dirty_logging_count == 0)) + kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); + + WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); +} + static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { + bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; + /* - * Nothing to do for RO slots (which can't be dirtied and can't be made - * writable) or CREATE/MOVE/DELETE of a slot. See comments below. + * Update CPU dirty logging if dirty logging is being toggled. This + * applies to all operations. + */ + if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); + + /* + * Nothing more to do for RO slots (which can't be dirtied and can't be + * made writable) or CREATE/MOVE/DELETE of a slot. See comments below. */ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) return; @@ -10792,7 +10819,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot() */ - if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if (!log_dirty_pages) kvm_mmu_zap_collapsible_sptes(kvm, new); /* @@ -10823,7 +10850,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * initial-all-set state. Otherwise, depending on whether pml * is enabled the D-bit or the W-bit will be cleared. */ - if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (log_dirty_pages) { if (kvm_x86_ops.cpu_dirty_log_size) { if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) kvm_mmu_slot_leaf_clear_dirty(kvm, new); From b6e16ae5d99fa39f0cb3d3f4558c2cbf44af38f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:13 -0800 Subject: [PATCH 184/506] KVM: x86/mmu: Don't set dirty bits when disabling dirty logging w/ PML Stop setting dirty bits for MMU pages when dirty logging is disabled for a memslot, as PML is now completely disabled when there are no memslots with dirty logging enabled. This means that spurious PML entries will be created for memslots with dirty logging disabled if at least one other memslot has dirty logging enabled. However, spurious PML entries are already possible since dirty bits are set only when a dirty logging is turned off, i.e. memslots that are never dirty logged will have dirty bits cleared. In the end, it's faster overall to eat a few spurious PML entries in the window where dirty logging is being disabled across all memslots. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 - arch/x86/kvm/mmu/mmu.c | 45 ----------------- arch/x86/kvm/mmu/tdp_mmu.c | 54 -------------------- arch/x86/kvm/mmu/tdp_mmu.h | 1 - arch/x86/kvm/x86.c | 87 ++++++++++++++------------------- 5 files changed, 36 insertions(+), 153 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ffcfa84c969d..c15d6de8c457 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1421,8 +1421,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); -void kvm_mmu_slot_set_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f208697781fc..562bff762c83 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1181,36 +1181,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return flush; } -static bool spte_set_dirty(u64 *sptep) -{ - u64 spte = *sptep; - - rmap_printk("spte %p %llx\n", sptep, *sptep); - - /* - * Similar to the !kvm_x86_ops.slot_disable_log_dirty case, - * do not bother adding back write access to pages marked - * SPTE_AD_WRPROT_ONLY_MASK. - */ - spte |= shadow_dirty_mask; - - return mmu_spte_update(sptep, spte); -} - -static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) -{ - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_set_dirty(sptep); - - return flush; -} - /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance @@ -5630,21 +5600,6 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } -void kvm_mmu_slot_set_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - bool flush; - - write_lock(&kvm->mmu_lock); - flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); - write_unlock(&kvm->mmu_lock); - - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); -} - void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index f8fa1f64e10d..c926c6b899a1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1268,60 +1268,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, } } -/* - * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is - * only used for PML, and so will involve setting the dirty bit on each SPTE. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. - */ -static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) -{ - struct tdp_iter iter; - u64 new_spte; - bool spte_set = false; - - rcu_read_lock(); - - tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) - continue; - - if (!is_shadow_present_pte(iter.old_spte) || - iter.old_spte & shadow_dirty_mask) - continue; - - new_spte = iter.old_spte | shadow_dirty_mask; - - tdp_mmu_set_spte(kvm, &iter, new_spte); - spte_set = true; - } - - rcu_read_unlock(); - return spte_set; -} - -/* - * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is - * only used for PML, and so will involve setting the dirty bit on each SPTE. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. - */ -bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - struct kvm_mmu_page *root; - int root_as_id; - bool spte_set = false; - - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - - spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - } - return spte_set; -} - /* * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index d31c5ed81a18..3b761c111bff 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -33,7 +33,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_memory_slot *slot); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b9a8c8af9713..dca2c3333ef2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10789,7 +10789,18 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, /* * Nothing more to do for RO slots (which can't be dirtied and can't be - * made writable) or CREATE/MOVE/DELETE of a slot. See comments below. + * made writable) or CREATE/MOVE/DELETE of a slot. + * + * For a memslot with dirty logging disabled: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() + * + * For a memslot with dirty logging enabled: + * CREATE: No shadow pages exist, thus nothing to write-protect + * and no dirty bits to clear. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot(). */ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) return; @@ -10802,55 +10813,31 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) return; - /* - * Dirty logging tracks sptes in 4k granularity, meaning that large - * sptes have to be split. If live migration is successful, the guest - * in the source machine will be destroyed and large sptes will be - * created in the destination. However, if the guest continues to run - * in the source machine (for example if live migration fails), small - * sptes will remain around and cause bad performance. - * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. - * - * There is no need to do this in any of the following cases: - * CREATE: No dirty mappings will already exist. - * MOVE/DELETE: The old mappings will already have been cleaned up by - * kvm_arch_flush_shadow_memslot() - */ - if (!log_dirty_pages) + if (!log_dirty_pages) { + /* + * Dirty logging tracks sptes in 4k granularity, meaning that + * large sptes have to be split. If live migration succeeds, + * the guest in the source machine will be destroyed and large + * sptes will be created in the destination. However, if the + * guest continues to run in the source machine (for example if + * live migration fails), small sptes will remain around and + * cause bad performance. + * + * Scan sptes if dirty logging has been stopped, dropping those + * which can be collapsed into a single large-page spte. Later + * page faults will create the large-page sptes. + */ kvm_mmu_zap_collapsible_sptes(kvm, new); - - /* - * Enable or disable dirty logging for the slot. - * - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old - * slot have been zapped so no dirty logging updates are needed for - * the old slot. - * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible - * any mappings that might be created in it will consume the - * properties of the new slot and do not need to be updated here. - * - * When PML is enabled, the kvm_x86_ops dirty logging hooks are - * called to enable/disable dirty logging. - * - * When disabling dirty logging with PML enabled, the D-bit is set - * for sptes in the slot in order to prevent unnecessary GPA - * logging in the PML buffer (and potential PML buffer full VMEXIT). - * This guarantees leaving PML enabled for the guest's lifetime - * won't have any additional overhead from PML when the guest is - * running with dirty logging disabled. - * - * When enabling dirty logging, large sptes are write-protected - * so they can be split on first write. New large sptes cannot - * be created for this slot until the end of the logging. - * See the comments in fast_page_fault(). - * For small sptes, nothing is done if the dirty log is in the - * initial-all-set state. Otherwise, depending on whether pml - * is enabled the D-bit or the W-bit will be cleared. - */ - if (log_dirty_pages) { + } else { + /* + * Large sptes are write-protected so they can be split on first + * write. New large sptes cannot be created for this slot until + * the end of the logging. See the comments in fast_page_fault(). + * + * For small sptes, nothing is done if the dirty log is in the + * initial-all-set state. Otherwise, depending on whether pml + * is enabled the D-bit or the W-bit will be cleared. + */ if (kvm_x86_ops.cpu_dirty_log_size) { if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) kvm_mmu_slot_leaf_clear_dirty(kvm, new); @@ -10870,8 +10857,6 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, */ kvm_mmu_slot_remove_write_access(kvm, new, level); } - } else if (kvm_x86_ops.cpu_dirty_log_size) { - kvm_mmu_slot_set_dirty(kvm, new); } } From a1419f8b5bab477d96a71d1c37da0784fb18dc51 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:14 -0800 Subject: [PATCH 185/506] KVM: x86: Fold "write-protect large" use case into generic write-protect Drop kvm_mmu_slot_largepage_remove_write_access() and refactor its sole caller to use kvm_mmu_slot_remove_write_access(). Remove the now-unused slot_handle_large_level() and slot_handle_all_level() helpers. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 32 -------------------------------- arch/x86/kvm/x86.c | 32 +++++++++++++++++--------------- 2 files changed, 17 insertions(+), 47 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 562bff762c83..e2178e0526d9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5204,22 +5204,6 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, lock_flush_tlb); } -static __always_inline bool -slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1, - KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) @@ -5584,22 +5568,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } -void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - bool flush; - - write_lock(&kvm->mmu_lock); - flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, - false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); - write_unlock(&kvm->mmu_lock); - - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); -} - void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dca2c3333ef2..1d2bc89431a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10829,24 +10829,25 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, */ kvm_mmu_zap_collapsible_sptes(kvm, new); } else { - /* - * Large sptes are write-protected so they can be split on first - * write. New large sptes cannot be created for this slot until - * the end of the logging. See the comments in fast_page_fault(). - * - * For small sptes, nothing is done if the dirty log is in the - * initial-all-set state. Otherwise, depending on whether pml - * is enabled the D-bit or the W-bit will be cleared. - */ + /* By default, write-protect everything to log writes. */ + int level = PG_LEVEL_4K; + if (kvm_x86_ops.cpu_dirty_log_size) { + /* + * Clear all dirty bits, unless pages are treated as + * dirty from the get-go. + */ if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) kvm_mmu_slot_leaf_clear_dirty(kvm, new); - kvm_mmu_slot_largepage_remove_write_access(kvm, new); - } else { - int level = - kvm_dirty_log_manual_protect_and_init_set(kvm) ? - PG_LEVEL_2M : PG_LEVEL_4K; + /* + * Write-protect large pages on write so that dirty + * logging happens at 4k granularity. No need to + * write-protect small SPTEs since write accesses are + * logged by the CPU via dirty bits. + */ + level = PG_LEVEL_2M; + } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { /* * If we're with initial-all-set, we don't need * to write protect any small page because @@ -10855,8 +10856,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * so that the page split can happen lazily on * the first write to the huge page. */ - kvm_mmu_slot_remove_write_access(kvm, new, level); + level = PG_LEVEL_2M; } + kvm_mmu_slot_remove_write_access(kvm, new, level); } } From 96ad91ae4eaff3697b1124b30d28d73de3557a3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:15 -0800 Subject: [PATCH 186/506] KVM: x86/mmu: Remove a variety of unnecessary exports Remove several exports from the MMU that are no longer necessary. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu/mmu.c | 35 ++++++++++++++------------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c15d6de8c457..0cf71ff2b2e5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1592,7 +1592,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2178e0526d9..6e78d8e51aa2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2466,7 +2466,21 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); + +static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + int r; + + if (vcpu->arch.mmu->direct_map) + return 0; + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + + return r; +} static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -3411,7 +3425,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); } -EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) @@ -4977,22 +4990,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, write_unlock(&vcpu->kvm->mmu_lock); } -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu->direct_map) - return 0; - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { @@ -5091,7 +5088,6 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->invlpg(vcpu, gva, root_hpa); } } -EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { @@ -5131,7 +5127,6 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) * for them. */ } -EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, int tdp_huge_page_level) From af0bfab907a011e146304d20d81dddce4e4d62d0 Mon Sep 17 00:00:00 2001 From: Abanoub Sameh Date: Fri, 11 Dec 2020 22:42:08 +0200 Subject: [PATCH 187/506] leds: led-core: Get rid of enum led_brightness This gets rid of enum led_brightness in the main led files, because it is deprecated, and an unsigned int can be used instead. We can get rid of led_brightness completely and patches can also be supplied for the other drivers' files. Signed-off-by: Abanoub Sameh Signed-off-by: Pavel Machek --- drivers/leds/led-class.c | 3 +-- drivers/leds/led-core.c | 20 +++++++------------- drivers/leds/leds.h | 6 ++---- include/linux/leds.h | 12 +++++------- 4 files changed, 15 insertions(+), 26 deletions(-) diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 131ca83f5fb3..2e495ff67856 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -145,8 +145,7 @@ static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed); } -void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, - enum led_brightness brightness) +void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness) { if (WARN_ON(!led_cdev->brightness_hw_changed_kn)) return; diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index c4e780bdb385..8eb8054ef9c6 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -39,8 +39,7 @@ const char * const led_colors[LED_COLOR_ID_MAX] = { }; EXPORT_SYMBOL_GPL(led_colors); -static int __led_set_brightness(struct led_classdev *led_cdev, - enum led_brightness value) +static int __led_set_brightness(struct led_classdev *led_cdev, unsigned int value) { if (!led_cdev->brightness_set) return -ENOTSUPP; @@ -50,8 +49,7 @@ static int __led_set_brightness(struct led_classdev *led_cdev, return 0; } -static int __led_set_brightness_blocking(struct led_classdev *led_cdev, - enum led_brightness value) +static int __led_set_brightness_blocking(struct led_classdev *led_cdev, unsigned int value) { if (!led_cdev->brightness_set_blocking) return -ENOTSUPP; @@ -240,8 +238,7 @@ void led_stop_software_blink(struct led_classdev *led_cdev) } EXPORT_SYMBOL_GPL(led_stop_software_blink); -void led_set_brightness(struct led_classdev *led_cdev, - enum led_brightness brightness) +void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness) { /* * If software blink is active, delay brightness setting @@ -253,7 +250,7 @@ void led_set_brightness(struct led_classdev *led_cdev, * work queue task to avoid problems in case we are called * from hard irq context. */ - if (brightness == LED_OFF) { + if (!brightness) { set_bit(LED_BLINK_DISABLE, &led_cdev->work_flags); schedule_work(&led_cdev->set_brightness_work); } else { @@ -268,8 +265,7 @@ void led_set_brightness(struct led_classdev *led_cdev, } EXPORT_SYMBOL_GPL(led_set_brightness); -void led_set_brightness_nopm(struct led_classdev *led_cdev, - enum led_brightness value) +void led_set_brightness_nopm(struct led_classdev *led_cdev, unsigned int value) { /* Use brightness_set op if available, it is guaranteed not to sleep */ if (!__led_set_brightness(led_cdev, value)) @@ -281,8 +277,7 @@ void led_set_brightness_nopm(struct led_classdev *led_cdev, } EXPORT_SYMBOL_GPL(led_set_brightness_nopm); -void led_set_brightness_nosleep(struct led_classdev *led_cdev, - enum led_brightness value) +void led_set_brightness_nosleep(struct led_classdev *led_cdev, unsigned int value) { led_cdev->brightness = min(value, led_cdev->max_brightness); @@ -293,8 +288,7 @@ void led_set_brightness_nosleep(struct led_classdev *led_cdev, } EXPORT_SYMBOL_GPL(led_set_brightness_nosleep); -int led_set_brightness_sync(struct led_classdev *led_cdev, - enum led_brightness value) +int led_set_brightness_sync(struct led_classdev *led_cdev, unsigned int value) { if (led_cdev->blink_delay_on || led_cdev->blink_delay_off) return -EBUSY; diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h index 2d9eb48bbed9..345062ccabda 100644 --- a/drivers/leds/leds.h +++ b/drivers/leds/leds.h @@ -19,10 +19,8 @@ static inline int led_get_brightness(struct led_classdev *led_cdev) void led_init_core(struct led_classdev *led_cdev); void led_stop_software_blink(struct led_classdev *led_cdev); -void led_set_brightness_nopm(struct led_classdev *led_cdev, - enum led_brightness value); -void led_set_brightness_nosleep(struct led_classdev *led_cdev, - enum led_brightness value); +void led_set_brightness_nopm(struct led_classdev *led_cdev, unsigned int value); +void led_set_brightness_nosleep(struct led_classdev *led_cdev, unsigned int value); ssize_t led_trigger_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t pos, size_t count); diff --git a/include/linux/leds.h b/include/linux/leds.h index 6a8d6409c993..329fd914cf24 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -63,8 +63,8 @@ struct led_hw_trigger_type { struct led_classdev { const char *name; - enum led_brightness brightness; - enum led_brightness max_brightness; + unsigned int brightness; + unsigned int max_brightness; int flags; /* Lower 16 bits reflect status */ @@ -253,8 +253,7 @@ void led_blink_set_oneshot(struct led_classdev *led_cdev, * software blink timer that implements blinking when the * hardware doesn't. This function is guaranteed not to sleep. */ -void led_set_brightness(struct led_classdev *led_cdev, - enum led_brightness brightness); +void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness); /** * led_set_brightness_sync - set LED brightness synchronously @@ -267,8 +266,7 @@ void led_set_brightness(struct led_classdev *led_cdev, * * Returns: 0 on success or negative error value on failure */ -int led_set_brightness_sync(struct led_classdev *led_cdev, - enum led_brightness value); +int led_set_brightness_sync(struct led_classdev *led_cdev, unsigned int value); /** * led_update_brightness - update LED brightness @@ -565,7 +563,7 @@ static inline void ledtrig_cpu(enum cpu_led_event evt) #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED void led_classdev_notify_brightness_hw_changed( - struct led_classdev *led_cdev, enum led_brightness brightness); + struct led_classdev *led_cdev, unsigned int brightness); #else static inline void led_classdev_notify_brightness_hw_changed( struct led_classdev *led_cdev, enum led_brightness brightness) { } From b113a7f1981062442b5a5318b6cf6b7ad4097b45 Mon Sep 17 00:00:00 2001 From: Amireddy Mallikarjuna reddy Date: Thu, 10 Dec 2020 17:12:11 +0800 Subject: [PATCH 188/506] dt-bindings: leds: Add bindings for Intel LGM SoC Add DT bindings YAML schema for SSO controller driver of Lightning Mountain (LGM) SoC. Signed-off-by: Amireddy Mallikarjuna reddy Signed-off-by: Pavel Machek Reviewed-by: Rob Herring --- .../devicetree/bindings/leds/leds-lgm.yaml | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 Documentation/devicetree/bindings/leds/leds-lgm.yaml diff --git a/Documentation/devicetree/bindings/leds/leds-lgm.yaml b/Documentation/devicetree/bindings/leds/leds-lgm.yaml new file mode 100644 index 000000000000..32bbf146c01d --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-lgm.yaml @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/leds/leds-lgm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Intel Lightning Mountain (LGM) SoC LED Serial Shift Output (SSO) Controller driver + +maintainers: + - Zhu, Yi Xin + - Amireddy Mallikarjuna reddy + +properties: + compatible: + const: intel,lgm-ssoled + + gpio-controller: true + + '#gpio-cells': + const: 2 + + ngpios: + minimum: 0 + maximum: 32 + description: + Number of GPIOs this controller provides. + + intel,sso-update-rate-hz: + description: + Blink frequency for SOUTs in Hz. + + led-controller: + type: object + description: + This sub-node must contain a sub-node for each leds. + + additionalProperties: false + + patternProperties: + "^led@[0-23]$": + type: object + + properties: + reg: + description: Index of the LED. + minimum: 0 + maximum: 2 + + intel,sso-hw-trigger: + type: boolean + description: This property indicates Hardware driven/control LED. + + intel,sso-hw-blink: + type: boolean + description: This property indicates Enable LED blink by Hardware. + + intel,sso-blink-rate-hz: + description: LED HW blink frequency. + + retain-state-suspended: + type: boolean + description: The suspend state of LED can be retained. + + retain-state-shutdown: + type: boolean + description: Retain the state of the LED on shutdown. + +required: + - compatible + - reg + - clocks + - clock-names + - "#gpio-cells" + - gpio-controller + +additionalProperties: false + +examples: + - | + #include + #include + + ssogpio: ssogpio@e0d40000 { + compatible = "intel,sso-led"; + reg = <0xE0D40000 0x2E4>; + gpio-controller; + #gpio-cells = <2>; + ngpios = <32>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_ledc>; + clocks = <&cgu0 LGM_GCLK_LEDC0>, <&afeclk>; + clock-names = "sso", "fpid"; + intel,sso-update-rate-hz = <250000>; + + led-controller { + #address-cells = <1>; + #size-cells = <0>; + + led@0 { + reg = <0>; + function = "gphy"; + color = ; + led-gpio = <&ssogpio 0 0>; + }; + + led@23 { + reg = <23>; + function = LED_FUNCTION_POWER; + color = ; + led-gpio = <&ssogpio 23 0>; + }; + }; + }; From c3987cd2bca34ddfec69027acedb2fae5ffcf7a0 Mon Sep 17 00:00:00 2001 From: Amireddy Mallikarjuna reddy Date: Thu, 10 Dec 2020 17:12:12 +0800 Subject: [PATCH 189/506] leds: lgm: Add LED controller driver for LGM SoC Parallel to serial conversion, which is also called SSO controller, can drive external shift register for LED outputs, reset or general purpose outputs. This driver enables LED support for Serial Shift Output Controller (SSO). Signed-off-by: Amireddy Mallikarjuna reddy Signed-off-by: Pavel Machek --- drivers/leds/Kconfig | 3 + drivers/leds/Makefile | 3 + drivers/leds/blink/Kconfig | 20 + drivers/leds/blink/Makefile | 2 + drivers/leds/blink/leds-lgm-sso.c | 888 ++++++++++++++++++++++++++++++ 5 files changed, 916 insertions(+) create mode 100644 drivers/leds/blink/Kconfig create mode 100644 drivers/leds/blink/Makefile create mode 100644 drivers/leds/blink/leds-lgm-sso.c diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 849d3c5f908e..d123fe2d9465 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -931,4 +931,7 @@ config LEDS_ACER_A500 comment "LED Triggers" source "drivers/leds/trigger/Kconfig" +comment "LED Blink" +source "drivers/leds/blink/Kconfig" + endif # NEW_LEDS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 73e603e1727e..de47e79ecb1f 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -105,3 +105,6 @@ obj-$(CONFIG_LEDS_USER) += uleds.o # LED Triggers obj-$(CONFIG_LEDS_TRIGGERS) += trigger/ + +# LED Blink +obj-$(CONFIG_LEDS_BLINK) += blink/ diff --git a/drivers/leds/blink/Kconfig b/drivers/leds/blink/Kconfig new file mode 100644 index 000000000000..265b53476a80 --- /dev/null +++ b/drivers/leds/blink/Kconfig @@ -0,0 +1,20 @@ +menuconfig LEDS_BLINK + bool "LED Blink support" + depends on LEDS_CLASS + help + This option enables blink support for the leds class. + If unsure, say Y. + +if LEDS_BLINK + +config LEDS_BLINK_LGM + tristate "LED support for Intel LGM SoC series" + depends on LEDS_CLASS + depends on MFD_SYSCON + depends on OF + help + Parallel to serial conversion, which is also called SSO controller, + can drive external shift register for LED outputs. + This enables LED support for Serial Shift Output controller(SSO). + +endif # LEDS_BLINK diff --git a/drivers/leds/blink/Makefile b/drivers/leds/blink/Makefile new file mode 100644 index 000000000000..2fa6c7b7b67e --- /dev/null +++ b/drivers/leds/blink/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_LEDS_BLINK_LGM) += leds-lgm-sso.o diff --git a/drivers/leds/blink/leds-lgm-sso.c b/drivers/leds/blink/leds-lgm-sso.c new file mode 100644 index 000000000000..7d5c9ca007d6 --- /dev/null +++ b/drivers/leds/blink/leds-lgm-sso.c @@ -0,0 +1,888 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Lightning Mountain SoC LED Serial Shift Output Controller driver + * + * Copyright (c) 2020 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SSO_DEV_NAME "lgm-sso" + +#define LED_BLINK_H8_0 0x0 +#define LED_BLINK_H8_1 0x4 +#define GET_FREQ_OFFSET(pin, src) (((pin) * 6) + ((src) * 2)) +#define GET_SRC_OFFSET(pinc) (((pin) * 6) + 4) + +#define DUTY_CYCLE(x) (0x8 + ((x) * 4)) +#define SSO_CON0 0x2B0 +#define SSO_CON0_RZFL BIT(26) +#define SSO_CON0_BLINK_R BIT(30) +#define SSO_CON0_SWU BIT(31) + +#define SSO_CON1 0x2B4 +#define SSO_CON1_FCDSC GENMASK(21, 20) /* Fixed Divider Shift Clock */ +#define SSO_CON1_FPID GENMASK(24, 23) +#define SSO_CON1_GPTD GENMASK(26, 25) +#define SSO_CON1_US GENMASK(31, 30) + +#define SSO_CPU 0x2B8 +#define SSO_CON2 0x2C4 +#define SSO_CON3 0x2C8 + +/* Driver MACRO */ +#define MAX_PIN_NUM_PER_BANK SZ_32 +#define MAX_GROUP_NUM SZ_4 +#define PINS_PER_GROUP SZ_8 +#define FPID_FREQ_RANK_MAX SZ_4 +#define SSO_LED_MAX_NUM SZ_32 +#define MAX_FREQ_RANK 10 +#define DEF_GPTC_CLK_RATE 200000000 +#define SSO_DEF_BRIGHTNESS LED_HALF +#define DATA_CLK_EDGE 0 /* 0-rising, 1-falling */ + +static const u32 freq_div_tbl[] = {4000, 2000, 1000, 800}; +static const int freq_tbl[] = {2, 4, 8, 10, 50000, 100000, 200000, 250000}; +static const int shift_clk_freq_tbl[] = {25000000, 12500000, 6250000, 3125000}; + +/* + * Update Source to update the SOUTs + * SW - Software has to update the SWU bit + * GPTC - General Purpose timer is used as clock source + * FPID - Divided FSC clock (FPID) is used as clock source + */ +enum { + US_SW = 0, + US_GPTC = 1, + US_FPID = 2 +}; + +enum { + MAX_FPID_FREQ_RANK = 5, /* 1 to 4 */ + MAX_GPTC_FREQ_RANK = 9, /* 5 to 8 */ + MAX_GPTC_HS_FREQ_RANK = 10, /* 9 to 10 */ +}; + +enum { + LED_GRP0_PIN_MAX = 24, + LED_GRP1_PIN_MAX = 29, + LED_GRP2_PIN_MAX = 32, +}; + +enum { + LED_GRP0_0_23, + LED_GRP1_24_28, + LED_GRP2_29_31, + LED_GROUP_MAX, +}; + +enum { + CLK_SRC_FPID = 0, + CLK_SRC_GPTC = 1, + CLK_SRC_GPTC_HS = 2, +}; + +struct sso_led_priv; + +struct sso_led_desc { + const char *name; + const char *default_trigger; + unsigned int brightness; + unsigned int blink_rate; + unsigned int retain_state_suspended:1; + unsigned int retain_state_shutdown:1; + unsigned int panic_indicator:1; + unsigned int hw_blink:1; + unsigned int hw_trig:1; + unsigned int blinking:1; + int freq_idx; + u32 pin; +}; + +struct sso_led { + struct list_head list; + struct led_classdev cdev; + struct gpio_desc *gpiod; + struct sso_led_desc desc; + struct sso_led_priv *priv; +}; + +struct sso_gpio { + struct gpio_chip chip; + int shift_clk_freq; + int edge; + int freq; + u32 pins; + u32 alloc_bitmap; +}; + +struct sso_led_priv { + struct regmap *mmap; + struct device *dev; + struct platform_device *pdev; + struct clk *gclk; + struct clk *fpid_clk; + u32 fpid_clkrate; + u32 gptc_clkrate; + u32 freq[MAX_FREQ_RANK]; + struct list_head led_list; + struct sso_gpio gpio; +}; + +static int sso_get_blink_rate_idx(struct sso_led_priv *priv, u32 rate) +{ + int i; + + for (i = 0; i < MAX_FREQ_RANK; i++) { + if (rate <= priv->freq[i]) + return i; + } + + return -1; +} + +static unsigned int sso_led_pin_to_group(u32 pin) +{ + if (pin < LED_GRP0_PIN_MAX) + return LED_GRP0_0_23; + else if (pin < LED_GRP1_PIN_MAX) + return LED_GRP1_24_28; + else + return LED_GRP2_29_31; +} + +static u32 sso_led_get_freq_src(int freq_idx) +{ + if (freq_idx < MAX_FPID_FREQ_RANK) + return CLK_SRC_FPID; + else if (freq_idx < MAX_GPTC_FREQ_RANK) + return CLK_SRC_GPTC; + else + return CLK_SRC_GPTC_HS; +} + +static u32 sso_led_pin_blink_off(u32 pin, unsigned int group) +{ + if (group == LED_GRP2_29_31) + return pin - LED_GRP1_PIN_MAX; + else if (group == LED_GRP1_24_28) + return pin - LED_GRP0_PIN_MAX; + else /* led 0 - 23 in led 32 location */ + return SSO_LED_MAX_NUM - LED_GRP1_PIN_MAX; +} + +static struct sso_led +*cdev_to_sso_led_data(struct led_classdev *led_cdev) +{ + return container_of(led_cdev, struct sso_led, cdev); +} + +static void sso_led_freq_set(struct sso_led_priv *priv, u32 pin, int freq_idx) +{ + u32 reg, off, freq_src, val_freq; + u32 low, high, val; + unsigned int group; + + if (!freq_idx) + return; + + group = sso_led_pin_to_group(pin); + freq_src = sso_led_get_freq_src(freq_idx); + off = sso_led_pin_blink_off(pin, group); + + if (group == LED_GRP0_0_23) + return; + else if (group == LED_GRP1_24_28) + reg = LED_BLINK_H8_0; + else + reg = LED_BLINK_H8_1; + + if (freq_src == CLK_SRC_FPID) + val_freq = freq_idx - 1; + else if (freq_src == CLK_SRC_GPTC) + val_freq = freq_idx - MAX_FPID_FREQ_RANK; + + /* set blink rate idx */ + if (freq_src != CLK_SRC_GPTC_HS) { + low = GET_FREQ_OFFSET(off, freq_src); + high = low + 2; + val = val_freq << high; + regmap_update_bits(priv->mmap, reg, GENMASK(high, low), val); + } + + /* select clock source */ + low = GET_SRC_OFFSET(off); + high = low + 2; + val = freq_src << high; + regmap_update_bits(priv->mmap, reg, GENMASK(high, low), val); +} + +static void sso_led_brightness_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct sso_led_priv *priv; + struct sso_led_desc *desc; + struct sso_led *led; + int val; + + led = cdev_to_sso_led_data(led_cdev); + priv = led->priv; + desc = &led->desc; + + desc->brightness = brightness; + regmap_write(priv->mmap, DUTY_CYCLE(desc->pin), brightness); + + if (brightness == LED_OFF) + val = 0; + else + val = 1; + + /* HW blink off */ + if (desc->hw_blink && !val && desc->blinking) { + desc->blinking = 0; + regmap_update_bits(priv->mmap, SSO_CON2, BIT(desc->pin), 0); + } else if (desc->hw_blink && val && !desc->blinking) { + desc->blinking = 1; + regmap_update_bits(priv->mmap, SSO_CON2, BIT(desc->pin), + 1 << desc->pin); + } + + if (!desc->hw_trig && led->gpiod) + gpiod_set_value(led->gpiod, val); +} + +static enum led_brightness sso_led_brightness_get(struct led_classdev *led_cdev) +{ + struct sso_led *led = cdev_to_sso_led_data(led_cdev); + + return (enum led_brightness)led->desc.brightness; +} + +static int +delay_to_freq_idx(struct sso_led *led, unsigned long *delay_on, + unsigned long *delay_off) +{ + struct sso_led_priv *priv = led->priv; + unsigned long delay; + int freq_idx; + u32 freq; + + if (!*delay_on && !*delay_off) { + *delay_on = *delay_off = (1000 / priv->freq[0]) / 2; + return 0; + } + + delay = *delay_on + *delay_off; + freq = 1000 / delay; + + freq_idx = sso_get_blink_rate_idx(priv, freq); + if (freq_idx == -1) + freq_idx = MAX_FREQ_RANK - 1; + + delay = 1000 / priv->freq[freq_idx]; + *delay_on = *delay_off = delay / 2; + + if (!*delay_on) + *delay_on = *delay_off = 1; + + return freq_idx; +} + +static int +sso_led_blink_set(struct led_classdev *led_cdev, unsigned long *delay_on, + unsigned long *delay_off) +{ + struct sso_led_priv *priv; + struct sso_led *led; + int freq_idx; + + led = cdev_to_sso_led_data(led_cdev); + priv = led->priv; + freq_idx = delay_to_freq_idx(led, delay_on, delay_off); + + sso_led_freq_set(priv, led->desc.pin, freq_idx); + regmap_update_bits(priv->mmap, SSO_CON2, BIT(led->desc.pin), + 1 << led->desc.pin); + led->desc.freq_idx = freq_idx; + led->desc.blink_rate = priv->freq[freq_idx]; + led->desc.blinking = 1; + + return 1; +} + +static void sso_led_hw_cfg(struct sso_led_priv *priv, struct sso_led *led) +{ + struct sso_led_desc *desc = &led->desc; + + /* set freq */ + if (desc->hw_blink) { + sso_led_freq_set(priv, desc->pin, desc->freq_idx); + regmap_update_bits(priv->mmap, SSO_CON2, BIT(desc->pin), + 1 << desc->pin); + } + + if (desc->hw_trig) + regmap_update_bits(priv->mmap, SSO_CON3, BIT(desc->pin), + 1 << desc->pin); + + /* set brightness */ + regmap_write(priv->mmap, DUTY_CYCLE(desc->pin), desc->brightness); + + /* enable output */ + if (!desc->hw_trig && desc->brightness) + gpiod_set_value(led->gpiod, 1); +} + +static int sso_create_led(struct sso_led_priv *priv, struct sso_led *led, + struct fwnode_handle *child) +{ + struct sso_led_desc *desc = &led->desc; + struct led_init_data init_data; + int err; + + init_data.fwnode = child; + init_data.devicename = SSO_DEV_NAME; + init_data.default_label = ":"; + + led->cdev.default_trigger = desc->default_trigger; + led->cdev.brightness_set = sso_led_brightness_set; + led->cdev.brightness_get = sso_led_brightness_get; + led->cdev.brightness = desc->brightness; + led->cdev.max_brightness = LED_FULL; + + if (desc->retain_state_shutdown) + led->cdev.flags |= LED_RETAIN_AT_SHUTDOWN; + if (desc->retain_state_suspended) + led->cdev.flags |= LED_CORE_SUSPENDRESUME; + if (desc->panic_indicator) + led->cdev.flags |= LED_PANIC_INDICATOR; + + if (desc->hw_blink) + led->cdev.blink_set = sso_led_blink_set; + + sso_led_hw_cfg(priv, led); + + err = devm_led_classdev_register_ext(priv->dev, &led->cdev, &init_data); + if (err) + return err; + + list_add(&led->list, &priv->led_list); + + return 0; +} + +static void sso_init_freq(struct sso_led_priv *priv) +{ + int i; + + priv->freq[0] = 0; + for (i = 1; i < MAX_FREQ_RANK; i++) { + if (i < MAX_FPID_FREQ_RANK) { + priv->freq[i] = priv->fpid_clkrate / freq_div_tbl[i - 1]; + } else if (i < MAX_GPTC_FREQ_RANK) { + priv->freq[i] = priv->gptc_clkrate / + freq_div_tbl[i - MAX_FPID_FREQ_RANK]; + } else if (i < MAX_GPTC_HS_FREQ_RANK) { + priv->freq[i] = priv->gptc_clkrate; + } + } +} + +static int sso_gpio_request(struct gpio_chip *chip, unsigned int offset) +{ + struct sso_led_priv *priv = gpiochip_get_data(chip); + + if (priv->gpio.alloc_bitmap & BIT(offset)) + return -EINVAL; + + priv->gpio.alloc_bitmap |= BIT(offset); + regmap_write(priv->mmap, DUTY_CYCLE(offset), 0xFF); + + return 0; +} + +static void sso_gpio_free(struct gpio_chip *chip, unsigned int offset) +{ + struct sso_led_priv *priv = gpiochip_get_data(chip); + + priv->gpio.alloc_bitmap &= ~BIT(offset); + regmap_write(priv->mmap, DUTY_CYCLE(offset), 0x0); +} + +static int sso_gpio_get_dir(struct gpio_chip *chip, unsigned int offset) +{ + return GPIOF_DIR_OUT; +} + +static int +sso_gpio_dir_out(struct gpio_chip *chip, unsigned int offset, int value) +{ + struct sso_led_priv *priv = gpiochip_get_data(chip); + bool bit = !!value; + + regmap_update_bits(priv->mmap, SSO_CPU, BIT(offset), bit << offset); + if (!priv->gpio.freq) + regmap_update_bits(priv->mmap, SSO_CON0, SSO_CON0_SWU, + SSO_CON0_SWU); + + return 0; +} + +static int sso_gpio_get(struct gpio_chip *chip, unsigned int offset) +{ + struct sso_led_priv *priv = gpiochip_get_data(chip); + u32 reg_val; + + regmap_read(priv->mmap, SSO_CPU, ®_val); + + return !!(reg_val & BIT(offset)); +} + +static void sso_gpio_set(struct gpio_chip *chip, unsigned int offset, int value) +{ + struct sso_led_priv *priv = gpiochip_get_data(chip); + + regmap_update_bits(priv->mmap, SSO_CPU, BIT(offset), value << offset); + if (!priv->gpio.freq) + regmap_update_bits(priv->mmap, SSO_CON0, SSO_CON0_SWU, + SSO_CON0_SWU); +} + +static int sso_gpio_gc_init(struct device *dev, struct sso_led_priv *priv) +{ + struct gpio_chip *gc = &priv->gpio.chip; + + gc->request = sso_gpio_request; + gc->free = sso_gpio_free; + gc->get_direction = sso_gpio_get_dir; + gc->direction_output = sso_gpio_dir_out; + gc->get = sso_gpio_get; + gc->set = sso_gpio_set; + + gc->label = "lgm-sso"; + gc->base = -1; + /* To exclude pins from control, use "gpio-reserved-ranges" */ + gc->ngpio = priv->gpio.pins; + gc->parent = dev; + gc->owner = THIS_MODULE; + gc->of_node = dev->of_node; + + return devm_gpiochip_add_data(dev, gc, priv); +} + +static int sso_gpio_get_freq_idx(int freq) +{ + int idx; + + for (idx = 0; idx < ARRAY_SIZE(freq_tbl); idx++) { + if (freq <= freq_tbl[idx]) + return idx; + } + + return -1; +} + +static void sso_register_shift_clk(struct sso_led_priv *priv) +{ + int idx, size = ARRAY_SIZE(shift_clk_freq_tbl); + u32 val = 0; + + for (idx = 0; idx < size; idx++) { + if (shift_clk_freq_tbl[idx] <= priv->gpio.shift_clk_freq) { + val = idx; + break; + } + } + + if (idx == size) + dev_warn(priv->dev, "%s: Invalid freq %d\n", + __func__, priv->gpio.shift_clk_freq); + + regmap_update_bits(priv->mmap, SSO_CON1, SSO_CON1_FCDSC, + FIELD_PREP(SSO_CON1_FCDSC, val)); +} + +static int sso_gpio_freq_set(struct sso_led_priv *priv) +{ + int freq_idx; + u32 val; + + freq_idx = sso_gpio_get_freq_idx(priv->gpio.freq); + if (freq_idx == -1) + freq_idx = ARRAY_SIZE(freq_tbl) - 1; + + val = freq_idx % FPID_FREQ_RANK_MAX; + + if (!priv->gpio.freq) { + regmap_update_bits(priv->mmap, SSO_CON0, SSO_CON0_BLINK_R, 0); + regmap_update_bits(priv->mmap, SSO_CON1, SSO_CON1_US, + FIELD_PREP(SSO_CON1_US, US_SW)); + } else if (freq_idx < FPID_FREQ_RANK_MAX) { + regmap_update_bits(priv->mmap, SSO_CON0, SSO_CON0_BLINK_R, + SSO_CON0_BLINK_R); + regmap_update_bits(priv->mmap, SSO_CON1, SSO_CON1_US, + FIELD_PREP(SSO_CON1_US, US_FPID)); + regmap_update_bits(priv->mmap, SSO_CON1, SSO_CON1_FPID, + FIELD_PREP(SSO_CON1_FPID, val)); + } else { + regmap_update_bits(priv->mmap, SSO_CON0, SSO_CON0_BLINK_R, + SSO_CON0_BLINK_R); + regmap_update_bits(priv->mmap, SSO_CON1, SSO_CON1_US, + FIELD_PREP(SSO_CON1_US, US_GPTC)); + regmap_update_bits(priv->mmap, SSO_CON1, SSO_CON1_GPTD, + FIELD_PREP(SSO_CON1_GPTD, val)); + } + + return 0; +} + +static int sso_gpio_hw_init(struct sso_led_priv *priv) +{ + u32 activate; + int i, err; + + /* Clear all duty cycles */ + for (i = 0; i < priv->gpio.pins; i++) { + err = regmap_write(priv->mmap, DUTY_CYCLE(i), 0); + if (err) + return err; + } + + /* 4 groups for total 32 pins */ + for (i = 1; i <= MAX_GROUP_NUM; i++) { + activate = !!(i * PINS_PER_GROUP <= priv->gpio.pins || + priv->gpio.pins > (i - 1) * PINS_PER_GROUP); + err = regmap_update_bits(priv->mmap, SSO_CON1, BIT(i - 1), + activate << (i - 1)); + if (err) + return err; + } + + /* NO HW directly controlled pin by default */ + err = regmap_write(priv->mmap, SSO_CON3, 0); + if (err) + return err; + + /* NO BLINK for all pins */ + err = regmap_write(priv->mmap, SSO_CON2, 0); + if (err) + return err; + + /* OUTPUT 0 by default */ + err = regmap_write(priv->mmap, SSO_CPU, 0); + if (err) + return err; + + /* update edge */ + err = regmap_update_bits(priv->mmap, SSO_CON0, SSO_CON0_RZFL, + FIELD_PREP(SSO_CON0_RZFL, priv->gpio.edge)); + if (err) + return err; + + /* Set GPIO update rate */ + sso_gpio_freq_set(priv); + + /* Register shift clock */ + sso_register_shift_clk(priv); + + return 0; +} + +static void sso_led_shutdown(struct sso_led *led) +{ + struct sso_led_priv *priv = led->priv; + + /* unregister led */ + devm_led_classdev_unregister(priv->dev, &led->cdev); + + /* clear HW control bit */ + if (led->desc.hw_trig) + regmap_update_bits(priv->mmap, SSO_CON3, BIT(led->desc.pin), 0); + + if (led->gpiod) + devm_gpiod_put(priv->dev, led->gpiod); + + led->priv = NULL; +} + +static int +__sso_led_dt_parse(struct sso_led_priv *priv, struct fwnode_handle *fw_ssoled) +{ + struct fwnode_handle *fwnode_child; + struct device *dev = priv->dev; + struct sso_led_desc *desc; + struct sso_led *led; + struct list_head *p; + const char *tmp; + u32 prop; + int ret; + + fwnode_for_each_child_node(fw_ssoled, fwnode_child) { + led = devm_kzalloc(dev, sizeof(*led), GFP_KERNEL); + if (!led) + return -ENOMEM; + + INIT_LIST_HEAD(&led->list); + led->priv = priv; + desc = &led->desc; + + led->gpiod = devm_fwnode_get_gpiod_from_child(dev, NULL, + fwnode_child, + GPIOD_ASIS, NULL); + if (IS_ERR(led->gpiod)) { + dev_err(dev, "led: get gpio fail!\n"); + goto __dt_err; + } + + fwnode_property_read_string(fwnode_child, + "linux,default-trigger", + &desc->default_trigger); + + if (fwnode_property_present(fwnode_child, + "retain-state-suspended")) + desc->retain_state_suspended = 1; + + if (fwnode_property_present(fwnode_child, + "retain-state-shutdown")) + desc->retain_state_shutdown = 1; + + if (fwnode_property_present(fwnode_child, "panic-indicator")) + desc->panic_indicator = 1; + + ret = fwnode_property_read_u32(fwnode_child, "reg", &prop); + if (ret != 0 || prop >= SSO_LED_MAX_NUM) { + dev_err(dev, "invalid LED pin:%u\n", prop); + goto __dt_err; + } + desc->pin = prop; + + if (fwnode_property_present(fwnode_child, "intel,sso-hw-blink")) + desc->hw_blink = 1; + + desc->hw_trig = fwnode_property_read_bool(fwnode_child, + "intel,sso-hw-trigger"); + if (desc->hw_trig) { + desc->default_trigger = NULL; + desc->retain_state_shutdown = 0; + desc->retain_state_suspended = 0; + desc->panic_indicator = 0; + desc->hw_blink = 0; + } + + if (fwnode_property_read_u32(fwnode_child, + "intel,sso-blink-rate-hz", &prop)) { + /* default first freq rate */ + desc->freq_idx = 0; + desc->blink_rate = priv->freq[desc->freq_idx]; + } else { + desc->freq_idx = sso_get_blink_rate_idx(priv, prop); + if (desc->freq_idx == -1) + desc->freq_idx = MAX_FREQ_RANK - 1; + + desc->blink_rate = priv->freq[desc->freq_idx]; + } + + if (!fwnode_property_read_string(fwnode_child, "default-state", &tmp)) { + if (!strcmp(tmp, "on")) + desc->brightness = LED_FULL; + } + + if (sso_create_led(priv, led, fwnode_child)) + goto __dt_err; + } + fwnode_handle_put(fw_ssoled); + + return 0; +__dt_err: + fwnode_handle_put(fw_ssoled); + /* unregister leds */ + list_for_each(p, &priv->led_list) { + led = list_entry(p, struct sso_led, list); + sso_led_shutdown(led); + } + + return -EINVAL; +} + +static int sso_led_dt_parse(struct sso_led_priv *priv) +{ + struct fwnode_handle *fwnode = dev_fwnode(priv->dev); + struct fwnode_handle *fw_ssoled; + struct device *dev = priv->dev; + int count; + int ret; + + count = device_get_child_node_count(dev); + if (!count) + return 0; + + fw_ssoled = fwnode_get_named_child_node(fwnode, "ssoled"); + if (fw_ssoled) { + ret = __sso_led_dt_parse(priv, fw_ssoled); + if (ret) + return ret; + } + + return 0; +} + +static int sso_probe_gpios(struct sso_led_priv *priv) +{ + struct device *dev = priv->dev; + int ret; + + if (device_property_read_u32(dev, "ngpios", &priv->gpio.pins)) + priv->gpio.pins = MAX_PIN_NUM_PER_BANK; + + if (priv->gpio.pins > MAX_PIN_NUM_PER_BANK) + return -EINVAL; + + if (device_property_read_u32(dev, "intel,sso-update-rate-hz", + &priv->gpio.freq)) + priv->gpio.freq = 0; + + priv->gpio.edge = DATA_CLK_EDGE; + priv->gpio.shift_clk_freq = -1; + + ret = sso_gpio_hw_init(priv); + if (ret) + return ret; + + return sso_gpio_gc_init(dev, priv); +} + +static void sso_clk_disable(void *data) +{ + struct sso_led_priv *priv = data; + + clk_disable_unprepare(priv->fpid_clk); + clk_disable_unprepare(priv->gclk); +} + +static int intel_sso_led_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sso_led_priv *priv; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->pdev = pdev; + priv->dev = dev; + + /* gate clock */ + priv->gclk = devm_clk_get(dev, "sso"); + if (IS_ERR(priv->gclk)) { + dev_err(dev, "get sso gate clock failed!\n"); + return PTR_ERR(priv->gclk); + } + + ret = clk_prepare_enable(priv->gclk); + if (ret) { + dev_err(dev, "Failed to prepate/enable sso gate clock!\n"); + return ret; + } + + priv->fpid_clk = devm_clk_get(dev, "fpid"); + if (IS_ERR(priv->fpid_clk)) { + dev_err(dev, "Failed to get fpid clock!\n"); + return PTR_ERR(priv->fpid_clk); + } + + ret = clk_prepare_enable(priv->fpid_clk); + if (ret) { + dev_err(dev, "Failed to prepare/enable fpid clock!\n"); + return ret; + } + priv->fpid_clkrate = clk_get_rate(priv->fpid_clk); + + ret = devm_add_action_or_reset(dev, sso_clk_disable, priv); + if (ret) { + dev_err(dev, "Failed to devm_add_action_or_reset, %d\n", ret); + return ret; + } + + priv->mmap = syscon_node_to_regmap(dev->of_node); + if (IS_ERR(priv->mmap)) { + dev_err(dev, "Failed to map iomem!\n"); + return PTR_ERR(priv->mmap); + } + + ret = sso_probe_gpios(priv); + if (ret) { + regmap_exit(priv->mmap); + return ret; + } + + INIT_LIST_HEAD(&priv->led_list); + + platform_set_drvdata(pdev, priv); + sso_init_freq(priv); + + priv->gptc_clkrate = DEF_GPTC_CLK_RATE; + + ret = sso_led_dt_parse(priv); + if (ret) { + regmap_exit(priv->mmap); + return ret; + } + dev_info(priv->dev, "sso LED init success!\n"); + + return 0; +} + +static int intel_sso_led_remove(struct platform_device *pdev) +{ + struct sso_led_priv *priv; + struct list_head *pos, *n; + struct sso_led *led; + + priv = platform_get_drvdata(pdev); + + list_for_each_safe(pos, n, &priv->led_list) { + list_del(pos); + led = list_entry(pos, struct sso_led, list); + sso_led_shutdown(led); + } + + clk_disable_unprepare(priv->fpid_clk); + clk_disable_unprepare(priv->gclk); + regmap_exit(priv->mmap); + + return 0; +} + +static const struct of_device_id of_sso_led_match[] = { + { .compatible = "intel,lgm-ssoled" }, + {} +}; + +MODULE_DEVICE_TABLE(of, of_sso_led_match); + +static struct platform_driver intel_sso_led_driver = { + .probe = intel_sso_led_probe, + .remove = intel_sso_led_remove, + .driver = { + .name = "lgm-ssoled", + .of_match_table = of_match_ptr(of_sso_led_match), + }, +}; + +module_platform_driver(intel_sso_led_driver); + +MODULE_DESCRIPTION("Intel SSO LED/GPIO driver"); +MODULE_LICENSE("GPL v2"); From 8e5c38a33c84935d66cfcf23c96960b6c4b484ef Mon Sep 17 00:00:00 2001 From: Gene Chen Date: Mon, 21 Dec 2020 18:45:50 +0800 Subject: [PATCH 190/506] leds: flash: Add flash registration with undefined CONFIG_LEDS_CLASS_FLASH Add flash registration with undefined CONFIG_LEDS_CLASS_FLASH, and move the same registration functions outside of #ifdef block. Signed-off-by: Gene Chen Acked-by: Jacek Anaszewski Signed-off-by: Pavel Machek --- include/linux/led-class-flash.h | 42 ++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h index 21a3358a1731..612b4cab3819 100644 --- a/include/linux/led-class-flash.h +++ b/include/linux/led-class-flash.h @@ -85,6 +85,7 @@ static inline struct led_classdev_flash *lcdev_to_flcdev( return container_of(lcdev, struct led_classdev_flash, led_cdev); } +#if IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) /** * led_classdev_flash_register_ext - register a new object of LED class with * init data and with support for flash LEDs @@ -98,12 +99,6 @@ int led_classdev_flash_register_ext(struct device *parent, struct led_classdev_flash *fled_cdev, struct led_init_data *init_data); -static inline int led_classdev_flash_register(struct device *parent, - struct led_classdev_flash *fled_cdev) -{ - return led_classdev_flash_register_ext(parent, fled_cdev, NULL); -} - /** * led_classdev_flash_unregister - unregisters an object of led_classdev class * with support for flash LEDs @@ -118,15 +113,44 @@ int devm_led_classdev_flash_register_ext(struct device *parent, struct led_init_data *init_data); +void devm_led_classdev_flash_unregister(struct device *parent, + struct led_classdev_flash *fled_cdev); + +#else + +static inline int led_classdev_flash_register_ext(struct device *parent, + struct led_classdev_flash *fled_cdev, + struct led_init_data *init_data) +{ + return 0; +} + +static inline void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev) {}; +static inline int devm_led_classdev_flash_register_ext(struct device *parent, + struct led_classdev_flash *fled_cdev, + struct led_init_data *init_data) +{ + return 0; +} + +static inline void devm_led_classdev_flash_unregister(struct device *parent, + struct led_classdev_flash *fled_cdev) +{}; + +#endif /* IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) */ + +static inline int led_classdev_flash_register(struct device *parent, + struct led_classdev_flash *fled_cdev) +{ + return led_classdev_flash_register_ext(parent, fled_cdev, NULL); +} + static inline int devm_led_classdev_flash_register(struct device *parent, struct led_classdev_flash *fled_cdev) { return devm_led_classdev_flash_register_ext(parent, fled_cdev, NULL); } -void devm_led_classdev_flash_unregister(struct device *parent, - struct led_classdev_flash *fled_cdev); - /** * led_set_flash_strobe - setup flash strobe * @fled_cdev: the flash LED to set strobe on From 6039b7e87be0b350a5f8fc135adfb5d1f4ba66ad Mon Sep 17 00:00:00 2001 From: Gene Chen Date: Mon, 21 Dec 2020 18:45:51 +0800 Subject: [PATCH 191/506] leds: flash: Fix multicolor no-ops registration by return 0 Fix multicolor no-ops registration by return 0, and move the same registration functions outside of #ifdef block. Signed-off-by: Gene Chen Acked-by: Jacek Anaszewski Signed-off-by: Pavel Machek --- include/linux/led-class-multicolor.h | 42 ++++++++++------------------ 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/include/linux/led-class-multicolor.h b/include/linux/led-class-multicolor.h index 5116f9a866cc..210d57bcd767 100644 --- a/include/linux/led-class-multicolor.h +++ b/include/linux/led-class-multicolor.h @@ -44,12 +44,6 @@ int led_classdev_multicolor_register_ext(struct device *parent, struct led_classdev_mc *mcled_cdev, struct led_init_data *init_data); -static inline int led_classdev_multicolor_register(struct device *parent, - struct led_classdev_mc *mcled_cdev) -{ - return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL); -} - /** * led_classdev_multicolor_unregister - unregisters an object of led_classdev * class with support for multicolor LEDs @@ -68,13 +62,6 @@ int devm_led_classdev_multicolor_register_ext(struct device *parent, struct led_classdev_mc *mcled_cdev, struct led_init_data *init_data); -static inline int devm_led_classdev_multicolor_register(struct device *parent, - struct led_classdev_mc *mcled_cdev) -{ - return devm_led_classdev_multicolor_register_ext(parent, mcled_cdev, - NULL); -} - void devm_led_classdev_multicolor_unregister(struct device *parent, struct led_classdev_mc *mcled_cdev); #else @@ -83,27 +70,33 @@ static inline int led_classdev_multicolor_register_ext(struct device *parent, struct led_classdev_mc *mcled_cdev, struct led_init_data *init_data) { - return -EINVAL; -} - -static inline int led_classdev_multicolor_register(struct device *parent, - struct led_classdev_mc *mcled_cdev) -{ - return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL); + return 0; } static inline void led_classdev_multicolor_unregister(struct led_classdev_mc *mcled_cdev) {}; static inline int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev, enum led_brightness brightness) { - return -EINVAL; + return 0; } static inline int devm_led_classdev_multicolor_register_ext(struct device *parent, struct led_classdev_mc *mcled_cdev, struct led_init_data *init_data) { - return -EINVAL; + return 0; +} + +static inline void devm_led_classdev_multicolor_unregister(struct device *parent, + struct led_classdev_mc *mcled_cdev) +{}; + +#endif /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */ + +static inline int led_classdev_multicolor_register(struct device *parent, + struct led_classdev_mc *mcled_cdev) +{ + return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL); } static inline int devm_led_classdev_multicolor_register(struct device *parent, @@ -113,9 +106,4 @@ static inline int devm_led_classdev_multicolor_register(struct device *parent, NULL); } -static inline void devm_led_classdev_multicolor_unregister(struct device *parent, - struct led_classdev_mc *mcled_cdev) -{}; - -#endif /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */ #endif /* _LINUX_MULTICOLOR_LEDS_H_INCLUDED */ From a2c42bbabbe260b7626d8459093631a6e16ee0ee Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 18 Feb 2021 14:03:46 +0000 Subject: [PATCH 192/506] arm64: spectre: Prevent lockdep splat on v4 mitigation enable path The Spectre-v4 workaround is re-configured when resuming from suspend, as the firmware may have re-enabled the mitigation despite the user previously asking for it to be disabled. Enabling or disabling the workaround can result in an undefined instruction exception on CPUs which implement PSTATE.SSBS but only allow it to be configured by adjusting the SPSR on exception return. We handle this by installing an 'undef hook' which effectively emulates the access. Installing this hook requires us to take a couple of spinlocks both to avoid corrupting the internal list of hooks but also to ensure that we don't run into an unhandled exception. Unfortunately, when resuming from suspend, we haven't yet called rcu_idle_exit() and so lockdep gets angry about "suspicious RCU usage". In doing so, it tries to print a warning, which leads it to get even more suspicious, this time about itself: | rcu_scheduler_active = 2, debug_locks = 1 | RCU used illegally from extended quiescent state! | 1 lock held by swapper/0: | #0: (logbuf_lock){-.-.}-{2:2}, at: vprintk_emit+0x88/0x198 | | Call trace: | dump_backtrace+0x0/0x1d8 | show_stack+0x18/0x24 | dump_stack+0xe0/0x17c | lockdep_rcu_suspicious+0x11c/0x134 | trace_lock_release+0xa0/0x160 | lock_release+0x3c/0x290 | _raw_spin_unlock+0x44/0x80 | vprintk_emit+0xbc/0x198 | vprintk_default+0x44/0x6c | vprintk_func+0x1f4/0x1fc | printk+0x54/0x7c | lockdep_rcu_suspicious+0x30/0x134 | trace_lock_acquire+0xa0/0x188 | lock_acquire+0x50/0x2fc | _raw_spin_lock+0x68/0x80 | spectre_v4_enable_mitigation+0xa8/0x30c | __cpu_suspend_exit+0xd4/0x1a8 | cpu_suspend+0xa0/0x104 | psci_cpu_suspend_enter+0x3c/0x5c | psci_enter_idle_state+0x44/0x74 | cpuidle_enter_state+0x148/0x2f8 | cpuidle_enter+0x38/0x50 | do_idle+0x1f0/0x2b4 Prevent these splats by running __cpu_suspend_exit() with RCU watching. Cc: Mark Rutland Cc: Lorenzo Pieralisi Cc: Peter Zijlstra Cc: Boqun Feng Cc: Marc Zyngier Cc: Saravana Kannan Suggested-by: "Paul E . McKenney" Reported-by: Sami Tolvanen Fixes: c28762070ca6 ("arm64: Rewrite Spectre-v4 mitigation code") Cc: Acked-by: Paul E. McKenney Acked-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210218140346.5224-1-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index a67b37a7a47e..d7564891ffe1 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -119,7 +119,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) if (!ret) ret = -EOPNOTSUPP; } else { - __cpu_suspend_exit(); + RCU_NONIDLE(__cpu_suspend_exit()); } unpause_graph_tracing(); From 656d1d58d8e0958d372db86c24f0b2ea36f50888 Mon Sep 17 00:00:00 2001 From: qiuguorui1 Date: Thu, 18 Feb 2021 20:59:00 +0800 Subject: [PATCH 193/506] arm64: kexec_file: fix memory leakage in create_dtb() when fdt_open_into() fails in function create_dtb(), if fdt_open_into() fails, we need to vfree buf before return. Fixes: 52b2a8af7436 ("arm64: kexec_file: load initrd and device-tree") Cc: stable@vger.kernel.org # v5.0 Signed-off-by: qiuguorui1 Link: https://lore.kernel.org/r/20210218125900.6810-1-qiuguorui1@huawei.com Signed-off-by: Will Deacon --- arch/arm64/kernel/machine_kexec_file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 03210f644790..0cde47a63beb 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -182,8 +182,10 @@ static int create_dtb(struct kimage *image, /* duplicate a device tree blob */ ret = fdt_open_into(initial_boot_params, buf, buf_size); - if (ret) + if (ret) { + vfree(buf); return -EINVAL; + } ret = setup_dtb(image, initrd_load_addr, initrd_len, cmdline, buf); From f5c6d0fcf90ce07ee0d686d465b19b247ebd5ed7 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 16 Feb 2021 18:32:34 +0000 Subject: [PATCH 194/506] arm64 module: set plt* section addresses to 0x0 These plt* and .text.ftrace_trampoline sections specified for arm64 have non-zero addressses. Non-zero section addresses in a relocatable ELF would confuse GDB when it tries to compute the section offsets and it ends up printing wrong symbol addresses. Therefore, set them to zero, which mirrors the change in commit 5d8591bc0fba ("module: set ksymtab/kcrctab* section addresses to 0x0"). Reported-by: Frank van der Linden Signed-off-by: Shaoying Xu Cc: Link: https://lore.kernel.org/r/20210216183234.GA23876@amazon.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/module.lds.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h index 691f15af788e..810045628c66 100644 --- a/arch/arm64/include/asm/module.lds.h +++ b/arch/arm64/include/asm/module.lds.h @@ -1,7 +1,7 @@ #ifdef CONFIG_ARM64_MODULE_PLTS SECTIONS { - .plt (NOLOAD) : { BYTE(0) } - .init.plt (NOLOAD) : { BYTE(0) } - .text.ftrace_trampoline (NOLOAD) : { BYTE(0) } + .plt 0 (NOLOAD) : { BYTE(0) } + .init.plt 0 (NOLOAD) : { BYTE(0) } + .text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) } } #endif From 61c1e0eb8375def7c891bfe857bb795a57090526 Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Mon, 28 Dec 2020 11:38:00 +0100 Subject: [PATCH 195/506] i40e: Fix add TC filter for IPv6 Fix insufficient distinction between IPv4 and IPv6 addresses when creating a filter. IPv4 and IPv6 are kept in the same memory area. If IPv6 is added, then it's caught by IPv4 check, which leads to err -95. Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Signed-off-by: Grzegorz Szczurek Signed-off-by: Mateusz Palczewski Reviewed-by: Jaroslaw Gawin Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2e22ab5a0f9a..3e4a4d6f0419 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7731,7 +7731,8 @@ int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, return -EOPNOTSUPP; /* adding filter using src_port/src_ip is not supported at this stage */ - if (filter->src_port || filter->src_ipv4 || + if (filter->src_port || + (filter->src_ipv4 && filter->n_proto != ETH_P_IPV6) || !ipv6_addr_any(&filter->ip.v6.src_ip6)) return -EOPNOTSUPP; @@ -7760,7 +7761,7 @@ int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, cpu_to_le16(I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT); } - } else if (filter->dst_ipv4 || + } else if ((filter->dst_ipv4 && filter->n_proto != ETH_P_IPV6) || !ipv6_addr_any(&filter->ip.v6.dst_ip6)) { cld_filter.element.flags = cpu_to_le16(I40E_AQC_ADD_CLOUD_FILTER_IP_PORT); From b32cddd2247cf730731f93f1967d0147a40682c7 Mon Sep 17 00:00:00 2001 From: Norbert Ciosek Date: Fri, 5 Feb 2021 08:48:52 +0000 Subject: [PATCH 196/506] i40e: Fix endianness conversions Fixes the following sparse warnings: i40e_main.c:5953:32: warning: cast from restricted __le16 i40e_main.c:8008:29: warning: incorrect type in assignment (different base types) i40e_main.c:8008:29: expected unsigned int [assigned] [usertype] ipa i40e_main.c:8008:29: got restricted __le32 [usertype] i40e_main.c:8008:29: warning: incorrect type in assignment (different base types) i40e_main.c:8008:29: expected unsigned int [assigned] [usertype] ipa i40e_main.c:8008:29: got restricted __le32 [usertype] i40e_txrx.c:1950:59: warning: incorrect type in initializer (different base types) i40e_txrx.c:1950:59: expected unsigned short [usertype] vlan_tag i40e_txrx.c:1950:59: got restricted __le16 [usertype] l2tag1 i40e_txrx.c:1953:40: warning: cast to restricted __le16 i40e_xsk.c:448:38: warning: invalid assignment: |= i40e_xsk.c:448:38: left side has type restricted __le64 i40e_xsk.c:448:38: right side has type int Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Fixes: 2a508c64ad27 ("i40e: fix VLAN.TCI == 0 RX HW offload") Fixes: 3106c580fb7c ("i40e: Use batched xsk Tx interfaces to increase performance") Fixes: 8f88b3034db3 ("i40e: Add infrastructure for queue channel support") Signed-off-by: Norbert Ciosek Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 12 ++++++------ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3e4a4d6f0419..4a2d03cada01 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5920,7 +5920,7 @@ static int i40e_add_channel(struct i40e_pf *pf, u16 uplink_seid, ch->enabled_tc = !i40e_is_channel_macvlan(ch) && enabled_tc; ch->seid = ctxt.seid; ch->vsi_number = ctxt.vsi_number; - ch->stat_counter_idx = cpu_to_le16(ctxt.info.stat_counter_idx); + ch->stat_counter_idx = le16_to_cpu(ctxt.info.stat_counter_idx); /* copy just the sections touched not the entire info * since not all sections are valid as returned by @@ -7599,8 +7599,8 @@ static inline void i40e_set_cld_element(struct i40e_cloud_filter *filter, struct i40e_aqc_cloud_filters_element_data *cld) { - int i, j; u32 ipa; + int i; memset(cld, 0, sizeof(*cld)); ether_addr_copy(cld->outer_mac, filter->dst_mac); @@ -7611,14 +7611,14 @@ i40e_set_cld_element(struct i40e_cloud_filter *filter, if (filter->n_proto == ETH_P_IPV6) { #define IPV6_MAX_INDEX (ARRAY_SIZE(filter->dst_ipv6) - 1) - for (i = 0, j = 0; i < ARRAY_SIZE(filter->dst_ipv6); - i++, j += 2) { + for (i = 0; i < ARRAY_SIZE(filter->dst_ipv6); i++) { ipa = be32_to_cpu(filter->dst_ipv6[IPV6_MAX_INDEX - i]); - ipa = cpu_to_le32(ipa); - memcpy(&cld->ipaddr.raw_v6.data[j], &ipa, sizeof(ipa)); + + *(__le32 *)&cld->ipaddr.raw_v6.data[i * 2] = cpu_to_le32(ipa); } } else { ipa = be32_to_cpu(filter->dst_ipv4); + memcpy(&cld->ipaddr.v4.data, &ipa, sizeof(ipa)); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 32d97315f3f5..903d4e8cb0a1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1793,7 +1793,7 @@ void i40e_process_skb_fields(struct i40e_ring *rx_ring, skb_record_rx_queue(skb, rx_ring->queue_index); if (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) { - u16 vlan_tag = rx_desc->wb.qword0.lo_dword.l2tag1; + __le16 vlan_tag = rx_desc->wb.qword0.lo_dword.l2tag1; __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), le16_to_cpu(vlan_tag)); diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 492ce213208d..37a21fb99922 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -444,7 +444,7 @@ static void i40e_set_rs_bit(struct i40e_ring *xdp_ring) struct i40e_tx_desc *tx_desc; tx_desc = I40E_TX_DESC(xdp_ring, ntu); - tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); + tx_desc->cmd_type_offset_bsz |= cpu_to_le64(I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); } /** From 1b40faf7e4abe10db2f730cf66b2b47551110940 Mon Sep 17 00:00:00 2001 From: Andreas Eberlein Date: Tue, 16 Feb 2021 14:30:28 +0100 Subject: [PATCH 197/506] leds: apu: extend support for PC Engines APU1 with newer firmware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DMI_PRODUCT_NAME entry on current firmware of PC Engines APU1 changed from "APU" to "apu1" This modification adds the missing DMI data and thereby the LED support for the PC Engines APU1 with firmware versions >= 4.6.8. Signed-off-by: Andreas Eberlein Tested-by: Zbyněk Kocur Signed-off-by: Pavel Machek --- drivers/leds/leds-apu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/leds/leds-apu.c b/drivers/leds/leds-apu.c index 7fd557aceff6..c409b80c236d 100644 --- a/drivers/leds/leds-apu.c +++ b/drivers/leds/leds-apu.c @@ -83,6 +83,7 @@ static const struct apu_led_profile apu1_led_profile[] = { }; static const struct dmi_system_id apu_led_dmi_table[] __initconst = { + /* PC Engines APU with factory bios "SageBios_PCEngines_APU-45" */ { .ident = "apu", .matches = { @@ -90,6 +91,14 @@ static const struct dmi_system_id apu_led_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "APU") } }, + /* PC Engines APU with "Mainline" bios >= 4.6.8 */ + { + .ident = "apu", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "PC Engines"), + DMI_MATCH(DMI_PRODUCT_NAME, "apu1") + } + }, {} }; MODULE_DEVICE_TABLE(dmi, apu_led_dmi_table); @@ -173,7 +182,7 @@ static int __init apu_led_init(void) int err; if (!(dmi_match(DMI_SYS_VENDOR, "PC Engines") && - dmi_match(DMI_PRODUCT_NAME, "APU"))) { + (dmi_match(DMI_PRODUCT_NAME, "APU") || dmi_match(DMI_PRODUCT_NAME, "apu1")))) { pr_err("No PC Engines APUv1 board detected. For APUv2,3 support, enable CONFIG_PCENGINES_APU2\n"); return -ENODEV; } From 9a10def9ceb5fa341d96a8b731dc2dc492e48d11 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 16 Feb 2021 17:50:44 +0200 Subject: [PATCH 198/506] leds: lp50xx: Don't spam logs when probe is deferred When requesting GPIO line the probe can be deferred. In such case don't spam logs with an error message. This can be achieved by switching to dev_err_probe(). Signed-off-by: Andy Shevchenko Signed-off-by: Pavel Machek --- drivers/leds/leds-lp50xx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c index f13117eed976..a2d18ec8fd2b 100644 --- a/drivers/leds/leds-lp50xx.c +++ b/drivers/leds/leds-lp50xx.c @@ -455,12 +455,9 @@ static int lp50xx_probe_dt(struct lp50xx *priv) int i = 0; priv->enable_gpio = devm_gpiod_get_optional(priv->dev, "enable", GPIOD_OUT_LOW); - if (IS_ERR(priv->enable_gpio)) { - ret = PTR_ERR(priv->enable_gpio); - dev_err(&priv->client->dev, "Failed to get enable gpio: %d\n", - ret); - return ret; - } + if (IS_ERR(priv->enable_gpio)) + return dev_err_probe(priv->dev, PTR_ERR(priv->enable_gpio), + "Failed to get enable GPIO\n"); priv->regulator = devm_regulator_get(priv->dev, "vled"); if (IS_ERR(priv->regulator)) From ea1ff99c9d235b8a54571d4292c71fce60993117 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 16 Feb 2021 17:50:45 +0200 Subject: [PATCH 199/506] leds: lp50xx: Switch to new style i2c-driver probe function Switch to the new style i2c-driver probe_new probe function. Note we do not have any old style board files using this but user still has a possibility to instantiate device from sysfs. Signed-off-by: Andy Shevchenko Signed-off-by: Pavel Machek --- drivers/leds/leds-lp50xx.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c index a2d18ec8fd2b..19aec80e527a 100644 --- a/drivers/leds/leds-lp50xx.c +++ b/drivers/leds/leds-lp50xx.c @@ -526,8 +526,7 @@ static int lp50xx_probe_dt(struct lp50xx *priv) return ret; } -static int lp50xx_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int lp50xx_probe(struct i2c_client *client) { struct lp50xx *led; int count; @@ -547,7 +546,7 @@ static int lp50xx_probe(struct i2c_client *client, mutex_init(&led->lock); led->client = client; led->dev = &client->dev; - led->chip_info = &lp50xx_chip_info_tbl[id->driver_data]; + led->chip_info = device_get_match_data(&client->dev); i2c_set_clientdata(client, led); led->regmap = devm_regmap_init_i2c(client, led->chip_info->lp50xx_regmap_config); @@ -593,24 +592,24 @@ static int lp50xx_remove(struct i2c_client *client) } static const struct i2c_device_id lp50xx_id[] = { - { "lp5009", LP5009 }, - { "lp5012", LP5012 }, - { "lp5018", LP5018 }, - { "lp5024", LP5024 }, - { "lp5030", LP5030 }, - { "lp5036", LP5036 }, + { "lp5009", (kernel_ulong_t)&lp50xx_chip_info_tbl[LP5009] }, + { "lp5012", (kernel_ulong_t)&lp50xx_chip_info_tbl[LP5012] }, + { "lp5018", (kernel_ulong_t)&lp50xx_chip_info_tbl[LP5018] }, + { "lp5024", (kernel_ulong_t)&lp50xx_chip_info_tbl[LP5024] }, + { "lp5030", (kernel_ulong_t)&lp50xx_chip_info_tbl[LP5030] }, + { "lp5036", (kernel_ulong_t)&lp50xx_chip_info_tbl[LP5036] }, { } }; MODULE_DEVICE_TABLE(i2c, lp50xx_id); static const struct of_device_id of_lp50xx_leds_match[] = { - { .compatible = "ti,lp5009", .data = (void *)LP5009 }, - { .compatible = "ti,lp5012", .data = (void *)LP5012 }, - { .compatible = "ti,lp5018", .data = (void *)LP5018 }, - { .compatible = "ti,lp5024", .data = (void *)LP5024 }, - { .compatible = "ti,lp5030", .data = (void *)LP5030 }, - { .compatible = "ti,lp5036", .data = (void *)LP5036 }, - {}, + { .compatible = "ti,lp5009", .data = &lp50xx_chip_info_tbl[LP5009] }, + { .compatible = "ti,lp5012", .data = &lp50xx_chip_info_tbl[LP5012] }, + { .compatible = "ti,lp5018", .data = &lp50xx_chip_info_tbl[LP5018] }, + { .compatible = "ti,lp5024", .data = &lp50xx_chip_info_tbl[LP5024] }, + { .compatible = "ti,lp5030", .data = &lp50xx_chip_info_tbl[LP5030] }, + { .compatible = "ti,lp5036", .data = &lp50xx_chip_info_tbl[LP5036] }, + {} }; MODULE_DEVICE_TABLE(of, of_lp50xx_leds_match); @@ -619,7 +618,7 @@ static struct i2c_driver lp50xx_driver = { .name = "lp50xx", .of_match_table = of_lp50xx_leds_match, }, - .probe = lp50xx_probe, + .probe_new = lp50xx_probe, .remove = lp50xx_remove, .id_table = lp50xx_id, }; From 556f15fe023ec1d9f9cd2781ba6cd14bda650d22 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 16 Feb 2021 17:50:46 +0200 Subject: [PATCH 200/506] leds: lp50xx: Reduce level of dereferences The priv->dev is effectively the same as &priv->client->dev. So, drop the latter for the former. Signed-off-by: Andy Shevchenko Signed-off-by: Pavel Machek --- drivers/leds/leds-lp50xx.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c index 19aec80e527a..0723b2688552 100644 --- a/drivers/leds/leds-lp50xx.c +++ b/drivers/leds/leds-lp50xx.c @@ -322,7 +322,7 @@ static int lp50xx_brightness_set(struct led_classdev *cdev, ret = regmap_write(led->priv->regmap, reg_val, brightness); if (ret) { - dev_err(&led->priv->client->dev, + dev_err(led->priv->dev, "Cannot write brightness value %d\n", ret); goto out; } @@ -338,7 +338,7 @@ static int lp50xx_brightness_set(struct led_classdev *cdev, ret = regmap_write(led->priv->regmap, reg_val, mc_dev->subled_info[i].intensity); if (ret) { - dev_err(&led->priv->client->dev, + dev_err(led->priv->dev, "Cannot write intensity value %d\n", ret); goto out; } @@ -404,7 +404,7 @@ static int lp50xx_probe_leds(struct fwnode_handle *child, struct lp50xx *priv, if (num_leds > 1) { if (num_leds > priv->chip_info->max_modules) { - dev_err(&priv->client->dev, "reg property is invalid\n"); + dev_err(priv->dev, "reg property is invalid\n"); return -EINVAL; } @@ -412,13 +412,13 @@ static int lp50xx_probe_leds(struct fwnode_handle *child, struct lp50xx *priv, ret = fwnode_property_read_u32_array(child, "reg", led_banks, num_leds); if (ret) { - dev_err(&priv->client->dev, "reg property is missing\n"); + dev_err(priv->dev, "reg property is missing\n"); return ret; } ret = lp50xx_set_banks(priv, led_banks); if (ret) { - dev_err(&priv->client->dev, "Cannot setup banked LEDs\n"); + dev_err(priv->dev, "Cannot setup banked LEDs\n"); return ret; } @@ -426,12 +426,12 @@ static int lp50xx_probe_leds(struct fwnode_handle *child, struct lp50xx *priv, } else { ret = fwnode_property_read_u32(child, "reg", &led_number); if (ret) { - dev_err(&priv->client->dev, "led reg property missing\n"); + dev_err(priv->dev, "led reg property missing\n"); return ret; } if (led_number > priv->chip_info->num_leds) { - dev_err(&priv->client->dev, "led-sources property is invalid\n"); + dev_err(priv->dev, "led-sources property is invalid\n"); return -EINVAL; } @@ -467,7 +467,7 @@ static int lp50xx_probe_dt(struct lp50xx *priv) led = &priv->leds[i]; ret = fwnode_property_count_u32(child, "reg"); if (ret < 0) { - dev_err(&priv->client->dev, "reg property is invalid\n"); + dev_err(priv->dev, "reg property is invalid\n"); goto child_out; } @@ -507,12 +507,11 @@ static int lp50xx_probe_dt(struct lp50xx *priv) led_cdev = &led->mc_cdev.led_cdev; led_cdev->brightness_set_blocking = lp50xx_brightness_set; - ret = devm_led_classdev_multicolor_register_ext(&priv->client->dev, + ret = devm_led_classdev_multicolor_register_ext(priv->dev, &led->mc_cdev, &init_data); if (ret) { - dev_err(&priv->client->dev, "led register err: %d\n", - ret); + dev_err(priv->dev, "led register err: %d\n", ret); goto child_out; } i++; @@ -575,15 +574,14 @@ static int lp50xx_remove(struct i2c_client *client) ret = lp50xx_enable_disable(led, 0); if (ret) { - dev_err(&led->client->dev, "Failed to disable chip\n"); + dev_err(led->dev, "Failed to disable chip\n"); return ret; } if (led->regulator) { ret = regulator_disable(led->regulator); if (ret) - dev_err(&led->client->dev, - "Failed to disable regulator\n"); + dev_err(led->dev, "Failed to disable regulator\n"); } mutex_destroy(&led->lock); From 5d2bfb3fb95b2d448c0fbcaa2c58b215b2fa87fc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 16 Feb 2021 17:50:48 +0200 Subject: [PATCH 201/506] leds: lp50xx: Get rid of redundant check in lp50xx_enable_disable() Since GPIO is optional the API is NULL aware and will check descriptor anyway. Remove duplicate redundant check in lp50xx_enable_disable(). Signed-off-by: Andy Shevchenko Signed-off-by: Pavel Machek --- drivers/leds/leds-lp50xx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c index 0723b2688552..ae82d4d7c9f3 100644 --- a/drivers/leds/leds-lp50xx.c +++ b/drivers/leds/leds-lp50xx.c @@ -382,11 +382,9 @@ static int lp50xx_enable_disable(struct lp50xx *priv, int enable_disable) { int ret; - if (priv->enable_gpio) { - ret = gpiod_direction_output(priv->enable_gpio, enable_disable); - if (ret) - return ret; - } + ret = gpiod_direction_output(priv->enable_gpio, enable_disable); + if (ret) + return ret; if (enable_disable) return regmap_write(priv->regmap, LP50XX_DEV_CFG0, LP50XX_CHIP_EN); From fb0f236beccbf74da90429d417864cfc6fc6673a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 16 Feb 2021 17:50:50 +0200 Subject: [PATCH 202/506] leds: lp50xx: Update headers block to reflect reality The OF is not used in the driver, thus the OF headers are not needed, but mod_devicetable.h is missed. Signed-off-by: Andy Shevchenko Signed-off-by: Pavel Machek --- drivers/leds/leds-lp50xx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c index ae82d4d7c9f3..1964483c94d3 100644 --- a/drivers/leds/leds-lp50xx.c +++ b/drivers/leds/leds-lp50xx.c @@ -6,10 +6,9 @@ #include #include #include +#include #include #include -#include -#include #include #include #include From b0a82efa51ad3ba1117817817cbabe9c9a37b893 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 16 Feb 2021 17:50:47 +0200 Subject: [PATCH 203/506] leds: lp50xx: Get rid of redundant explicit casting In the line like u32 bar = ...; u8 foo = (u8)(bar >> 8) & 0xff; is no need to have neither explicit casting nor ' & 0xff' part. Get rid of them. Signed-off-by: Andy Shevchenko Signed-off-by: Pavel Machek --- drivers/leds/leds-lp50xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c index 1964483c94d3..06230614fdc5 100644 --- a/drivers/leds/leds-lp50xx.c +++ b/drivers/leds/leds-lp50xx.c @@ -359,8 +359,8 @@ static int lp50xx_set_banks(struct lp50xx *priv, u32 led_banks[]) bank_enable_mask |= (1 << led_banks[i]); } - led_config_lo = (u8)(bank_enable_mask & 0xff); - led_config_hi = (u8)(bank_enable_mask >> 8) & 0xff; + led_config_lo = bank_enable_mask; + led_config_hi = bank_enable_mask >> 8; ret = regmap_write(priv->regmap, LP50XX_LED_CFG0, led_config_lo); if (ret) From 3a2eb515d1367c0f667b76089a6e727279c688b8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 19 Feb 2021 12:56:32 +0300 Subject: [PATCH 204/506] octeontx2-af: Fix an off by one in rvu_dbg_qsize_write() This code does not allocate enough memory for the NUL terminator so it ends up putting it one character beyond the end of the buffer. Fixes: 8756828a8148 ("octeontx2-af: Add NPA aura and pool contexts to debugfs") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index 094124b695dc..aa2ca8780b9c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -473,7 +473,7 @@ static ssize_t rvu_dbg_qsize_write(struct file *filp, u16 pcifunc; int ret, lf; - cmd_buf = memdup_user(buffer, count); + cmd_buf = memdup_user(buffer, count + 1); if (IS_ERR(cmd_buf)) return -ENOMEM; From d9b2a2bbbb4d0bc89129504eb1503bb8506158ed Mon Sep 17 00:00:00 2001 From: Lauri Kasanen Date: Sat, 23 Jan 2021 09:53:27 +0200 Subject: [PATCH 205/506] block: Add n64 cart driver This adds support for the Nintendo 64 console's carts. Carts are a read-only media ranging from 8mb to 64mb. Only one cart can be connected at once, and switching it requires a reboot. No module support to save RAM, as the target has 8mb RAM. Signed-off-by: Lauri Kasanen Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Thomas Bogendoerfer --- drivers/block/Kconfig | 6 ++ drivers/block/Makefile | 1 + drivers/block/n64cart.c | 189 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+) create mode 100644 drivers/block/n64cart.c diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 262326973ee0..32ef21563e2a 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -66,6 +66,12 @@ config AMIGA_Z2RAM To compile this driver as a module, choose M here: the module will be called z2ram. +config N64CART + bool "N64 cart support" + depends on MACH_NINTENDO64 + help + Support for the N64 cart. + config CDROM tristate select BLK_SCSI_REQUEST diff --git a/drivers/block/Makefile b/drivers/block/Makefile index a3170859e01d..2aeee3f5f3ce 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_PS3_DISK) += ps3disk.o obj-$(CONFIG_PS3_VRAM) += ps3vram.o obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o +obj-$(CONFIG_N64CART) += n64cart.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o obj-$(CONFIG_XILINX_SYSACE) += xsysace.o diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c new file mode 100644 index 000000000000..e76722acba46 --- /dev/null +++ b/drivers/block/n64cart.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for the N64 cart. + * + * Copyright (c) 2021 Lauri Kasanen + */ + +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Lauri Kasanen "); +MODULE_DESCRIPTION("Driver for the N64 cart"); +MODULE_LICENSE("GPL"); + +static unsigned int start, size; +static u32 __iomem *reg_base; +static struct device *dev; + +#define PI_DRAM_REG 0 +#define PI_CART_REG 1 +#define PI_READ_REG 2 +#define PI_WRITE_REG 3 +#define PI_STATUS_REG 4 + +#define PI_STATUS_DMA_BUSY (1 << 0) +#define PI_STATUS_IO_BUSY (1 << 1) + +#define CART_DOMAIN 0x10000000 +#define CART_MAX 0x1FFFFFFF + +#define MIN_ALIGNMENT 8 + +static void n64cart_write_reg(const u8 reg, const u32 value) +{ + writel(value, reg_base + reg); +} + +static u32 n64cart_read_reg(const u8 reg) +{ + return readl(reg_base + reg); +} + +static void n64cart_wait_dma(void) +{ + while (n64cart_read_reg(PI_STATUS_REG) & + (PI_STATUS_DMA_BUSY | PI_STATUS_IO_BUSY)) + cpu_relax(); +} + +/* + * Process a single bvec of a bio. + */ +static bool n64cart_do_bvec(struct device *dev, struct bio_vec *bv, u32 pos) +{ + dma_addr_t dma_addr; + const u32 bstart = pos + start; + + /* Alignment check */ + WARN_ON_ONCE((bv->bv_offset & (MIN_ALIGNMENT - 1)) || + (bv->bv_len & (MIN_ALIGNMENT - 1))); + + dma_addr = dma_map_bvec(dev, bv, DMA_FROM_DEVICE, 0); + if (dma_mapping_error(dev, dma_addr)) + return false; + + n64cart_wait_dma(); + + n64cart_write_reg(PI_DRAM_REG, dma_addr + bv->bv_offset); + n64cart_write_reg(PI_CART_REG, (bstart | CART_DOMAIN) & CART_MAX); + n64cart_write_reg(PI_WRITE_REG, bv->bv_len - 1); + + n64cart_wait_dma(); + + dma_unmap_page(dev, dma_addr, bv->bv_len, DMA_FROM_DEVICE); + return true; +} + +static blk_qc_t n64cart_submit_bio(struct bio *bio) +{ + struct bio_vec bvec; + u32 pos; + struct bvec_iter iter; + + pos = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + bio_for_each_segment(bvec, bio, iter) { + if (!n64cart_do_bvec(dev, &bvec, pos)) + goto io_error; + pos += bvec.bv_len; + } + + bio_endio(bio); + return BLK_QC_T_NONE; +io_error: + bio_io_error(bio); + return BLK_QC_T_NONE; +} + +static const struct block_device_operations n64cart_fops = { + .owner = THIS_MODULE, + .submit_bio = n64cart_submit_bio, +}; + +/* + * The target device is embedded and RAM-constrained. We save RAM + * by initializing in __init code that gets dropped late in boot. + * For the same reason there is no module or unloading support. + */ +static int __init n64cart_probe(struct platform_device *pdev) +{ + int err; + struct request_queue *queue; + struct gendisk *disk; + + if (!start || !size) { + pr_err("n64cart: start and size not specified\n"); + return -ENODEV; + } + + if (size & 4095) { + pr_err("n64cart: size must be a multiple of 4K\n"); + return -ENODEV; + } + + queue = blk_alloc_queue(NUMA_NO_NODE); + if (!queue) { + return -ENOMEM; + } + + reg_base = devm_platform_ioremap_resource(pdev, 0); + if (!reg_base) { + err = -EINVAL; + goto fail_queue; + } + + disk = alloc_disk(0); + if (!disk) { + err = -ENOMEM; + goto fail_queue; + } + + dev = &pdev->dev; + + disk->first_minor = 0; + disk->queue = queue; + disk->flags = GENHD_FL_NO_PART_SCAN | GENHD_FL_EXT_DEVT; + disk->fops = &n64cart_fops; + strcpy(disk->disk_name, "n64cart"); + + set_capacity(disk, size / 512); + set_disk_ro(disk, 1); + + blk_queue_flag_set(QUEUE_FLAG_NONROT, queue); + blk_queue_physical_block_size(queue, 4096); + blk_queue_logical_block_size(queue, 4096); + + add_disk(disk); + + pr_info("n64cart: %u kb disk\n", size / 1024); + + return 0; +fail_queue: + blk_cleanup_queue(queue); + + return err; +} + +static struct platform_driver n64cart_driver = { + .driver = { + .name = "n64cart", + }, +}; + +static int __init n64cart_init(void) +{ + return platform_driver_probe(&n64cart_driver, n64cart_probe); +} + +module_param(start, uint, 0); +MODULE_PARM_DESC(start, "Start address of the cart block data"); + +module_param(size, uint, 0); +MODULE_PARM_DESC(size, "Size of the cart block data, in bytes"); + +module_init(n64cart_init); From f1e19224f5948ae61ff9972d35d3cd7176815cd9 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:35 -0800 Subject: [PATCH 206/506] n64: use pr_fmt to avoid duplicate string Instead of repeating the n64cart string all over the module use pr_fmt macro and remove the duplicate string. Also, replace and with or in the one of the error message. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index e76722acba46..8c7c9249071b 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -5,6 +5,7 @@ * Copyright (c) 2021 Lauri Kasanen */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include #include @@ -117,12 +118,12 @@ static int __init n64cart_probe(struct platform_device *pdev) struct gendisk *disk; if (!start || !size) { - pr_err("n64cart: start and size not specified\n"); + pr_err("start or size not specified\n"); return -ENODEV; } if (size & 4095) { - pr_err("n64cart: size must be a multiple of 4K\n"); + pr_err("size must be a multiple of 4K\n"); return -ENODEV; } From 9ee8c9a1c752f6181c1403fa5b4da620b410d9af Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:36 -0800 Subject: [PATCH 207/506] n64: move module info at the end Move the module auth, description, and license at the end of the file just like what we have for the other modules. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 8c7c9249071b..63090030ed2b 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -13,10 +13,6 @@ #include #include -MODULE_AUTHOR("Lauri Kasanen "); -MODULE_DESCRIPTION("Driver for the N64 cart"); -MODULE_LICENSE("GPL"); - static unsigned int start, size; static u32 __iomem *reg_base; static struct device *dev; @@ -188,3 +184,7 @@ module_param(size, uint, 0); MODULE_PARM_DESC(size, "Size of the cart block data, in bytes"); module_init(n64cart_init); + +MODULE_AUTHOR("Lauri Kasanen "); +MODULE_DESCRIPTION("Driver for the N64 cart"); +MODULE_LICENSE("GPL"); From e39e31326305d9bb35f8ab78c4310b9a38bbb3aa Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:37 -0800 Subject: [PATCH 208/506] n64: move module param at the top Move module parameters at the top of the file after macro definition & global variables below macro definitions just like we have for other modules. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 63090030ed2b..b18f034ee1ad 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -13,10 +13,6 @@ #include #include -static unsigned int start, size; -static u32 __iomem *reg_base; -static struct device *dev; - #define PI_DRAM_REG 0 #define PI_CART_REG 1 #define PI_READ_REG 2 @@ -31,6 +27,17 @@ static struct device *dev; #define MIN_ALIGNMENT 8 +static u32 __iomem *reg_base; +static struct device *dev; + +static unsigned int start; +module_param(start, uint, 0); +MODULE_PARM_DESC(start, "Start address of the cart block data"); + +static unsigned int size; +module_param(size, uint, 0); +MODULE_PARM_DESC(size, "Size of the cart block data, in bytes"); + static void n64cart_write_reg(const u8 reg, const u32 value) { writel(value, reg_base + reg); @@ -177,12 +184,6 @@ static int __init n64cart_init(void) return platform_driver_probe(&n64cart_driver, n64cart_probe); } -module_param(start, uint, 0); -MODULE_PARM_DESC(start, "Start address of the cart block data"); - -module_param(size, uint, 0); -MODULE_PARM_DESC(size, "Size of the cart block data, in bytes"); - module_init(n64cart_init); MODULE_AUTHOR("Lauri Kasanen "); From 2ce503b35dcea29767c6d03b44e3c535809fdfcc Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:38 -0800 Subject: [PATCH 209/506] n64: use enums for reg Macros tend to be not type-safe. Use enum for register definitions. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index b18f034ee1ad..620f9e080d5d 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -13,11 +13,13 @@ #include #include -#define PI_DRAM_REG 0 -#define PI_CART_REG 1 -#define PI_READ_REG 2 -#define PI_WRITE_REG 3 -#define PI_STATUS_REG 4 +enum { + PI_DRAM_REG = 0, + PI_CART_REG, + PI_READ_REG, + PI_WRITE_REG, + PI_STATUS_REG, +}; #define PI_STATUS_DMA_BUSY (1 << 0) #define PI_STATUS_IO_BUSY (1 << 1) From 857f6fde1c6e800b685c2da864dabd7ff9091dca Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:39 -0800 Subject: [PATCH 210/506] n64: use sector SECTOR_SHIFT instead 512 Instead of using magic numbers use SECTOR_SHIFT to get the number of sectors from the size. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 620f9e080d5d..c83a6af5a718 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -157,7 +157,7 @@ static int __init n64cart_probe(struct platform_device *pdev) disk->fops = &n64cart_fops; strcpy(disk->disk_name, "n64cart"); - set_capacity(disk, size / 512); + set_capacity(disk, size >> SECTOR_SHIFT); set_disk_ro(disk, 1); blk_queue_flag_set(QUEUE_FLAG_NONROT, queue); From 82a0c13a08d8265fe6412f8683a6011ce881df49 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:40 -0800 Subject: [PATCH 211/506] n64: remove curly brackets Remove extra braces for the if which has only single statement. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index c83a6af5a718..7906b5b2f12e 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -133,9 +133,8 @@ static int __init n64cart_probe(struct platform_device *pdev) } queue = blk_alloc_queue(NUMA_NO_NODE); - if (!queue) { + if (!queue) return -ENOMEM; - } reg_base = devm_platform_ioremap_resource(pdev, 0); if (!reg_base) { From 37772f9136f442a1098d0ae1238def72f1216057 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:41 -0800 Subject: [PATCH 212/506] n64: cosmetics changes Make the variable declaration ascending order and initialize the variables at the time of declaration when possible. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 7906b5b2f12e..3bfb010402e3 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -88,10 +88,8 @@ static bool n64cart_do_bvec(struct device *dev, struct bio_vec *bv, u32 pos) static blk_qc_t n64cart_submit_bio(struct bio *bio) { struct bio_vec bvec; - u32 pos; struct bvec_iter iter; - - pos = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT; bio_for_each_segment(bvec, bio, iter) { if (!n64cart_do_bvec(dev, &bvec, pos)) @@ -119,8 +117,8 @@ static const struct block_device_operations n64cart_fops = { static int __init n64cart_probe(struct platform_device *pdev) { int err; - struct request_queue *queue; struct gendisk *disk; + struct request_queue *queue; if (!start || !size) { pr_err("start or size not specified\n"); From 0d424780852eb60467a6f053d92495bb845ac186 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:42 -0800 Subject: [PATCH 213/506] n64: cleanup n64cart_probe() The goto label fail_queue is needed to cleanup the queue allocation when devm_platform_ioremap_resource() or alloc_disk() fails, either of these two functions are not dependent on the queue variable which is allocated prior to these calls. Allocate the queue variable after successful alloc_disk(). Return error directly when devm_platform_ioremap_resource() or alloc_disk() fail. Remove fail_queue label and a call to the blk_cleanup_queue(). Direct return from these two functions allows us to remove the local variable err and allocating queue after alloc_disk() allows us to remove the local variable queue so we use disk->queue directly. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 3bfb010402e3..43482d158640 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -116,9 +116,7 @@ static const struct block_device_operations n64cart_fops = { */ static int __init n64cart_probe(struct platform_device *pdev) { - int err; struct gendisk *disk; - struct request_queue *queue; if (!start || !size) { pr_err("start or size not specified\n"); @@ -130,26 +128,21 @@ static int __init n64cart_probe(struct platform_device *pdev) return -ENODEV; } - queue = blk_alloc_queue(NUMA_NO_NODE); - if (!queue) - return -ENOMEM; - reg_base = devm_platform_ioremap_resource(pdev, 0); - if (!reg_base) { - err = -EINVAL; - goto fail_queue; - } + if (!reg_base) + return -EINVAL; disk = alloc_disk(0); - if (!disk) { - err = -ENOMEM; - goto fail_queue; - } + if (!disk) + return -ENOMEM; + + disk->queue = blk_alloc_queue(NUMA_NO_NODE); + if (!disk->queue) + return -ENOMEM; dev = &pdev->dev; disk->first_minor = 0; - disk->queue = queue; disk->flags = GENHD_FL_NO_PART_SCAN | GENHD_FL_EXT_DEVT; disk->fops = &n64cart_fops; strcpy(disk->disk_name, "n64cart"); @@ -157,19 +150,15 @@ static int __init n64cart_probe(struct platform_device *pdev) set_capacity(disk, size >> SECTOR_SHIFT); set_disk_ro(disk, 1); - blk_queue_flag_set(QUEUE_FLAG_NONROT, queue); - blk_queue_physical_block_size(queue, 4096); - blk_queue_logical_block_size(queue, 4096); + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_physical_block_size(disk->queue, 4096); + blk_queue_logical_block_size(disk->queue, 4096); add_disk(disk); pr_info("n64cart: %u kb disk\n", size / 1024); return 0; -fail_queue: - blk_cleanup_queue(queue); - - return err; } static struct platform_driver n64cart_driver = { From 13d41b537df7d2538f901aa98f82672482b50d12 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 25 Jan 2021 15:32:43 -0800 Subject: [PATCH 214/506] n64: store dev instance into disk private data The device instance is declared globally. Remove global variable & use the disk->private_data to store the device instance in the n64cart_probe() and get the same instance from bio->bi_disk->private data in n64cart_submit_bio. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Lauri Kasanen Signed-off-by: Thomas Bogendoerfer --- drivers/block/n64cart.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 43482d158640..47bdf324e962 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -30,7 +30,6 @@ enum { #define MIN_ALIGNMENT 8 static u32 __iomem *reg_base; -static struct device *dev; static unsigned int start; module_param(start, uint, 0); @@ -89,6 +88,7 @@ static blk_qc_t n64cart_submit_bio(struct bio *bio) { struct bio_vec bvec; struct bvec_iter iter; + struct device *dev = bio->bi_disk->private_data; u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT; bio_for_each_segment(bvec, bio, iter) { @@ -140,11 +140,10 @@ static int __init n64cart_probe(struct platform_device *pdev) if (!disk->queue) return -ENOMEM; - dev = &pdev->dev; - disk->first_minor = 0; disk->flags = GENHD_FL_NO_PART_SCAN | GENHD_FL_EXT_DEVT; disk->fops = &n64cart_fops; + disk->private_data = &pdev->dev; strcpy(disk->disk_name, "n64cart"); set_capacity(disk, size >> SECTOR_SHIFT); From 865fa29f7dd1b6af8498fe08f19b4028c1c8a153 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 15 Feb 2021 09:48:22 +0900 Subject: [PATCH 215/506] arch: syscalls: add missing FORCE and fix 'targets' to make if_changed work The rules in these Makefiles cannot detect the command line change because the prerequisite 'FORCE' is missing. Adding 'FORCE' will result in the headers being rebuilt every time because the 'targets' additions are also wrong; the file paths in 'targets' must be relative to the current Makefile. Fix all of them so the if_changed rules work correctly. Signed-off-by: Masahiro Yamada Acked-by: Geert Uytterhoeven --- arch/alpha/kernel/syscalls/Makefile | 11 +++++----- arch/ia64/kernel/syscalls/Makefile | 11 +++++----- arch/m68k/kernel/syscalls/Makefile | 11 +++++----- arch/microblaze/kernel/syscalls/Makefile | 11 +++++----- arch/mips/kernel/syscalls/Makefile | 27 ++++++++++++------------ arch/parisc/kernel/syscalls/Makefile | 17 ++++++++------- arch/powerpc/kernel/syscalls/Makefile | 19 +++++++++-------- arch/sh/kernel/syscalls/Makefile | 11 +++++----- arch/sparc/kernel/syscalls/Makefile | 17 ++++++++------- arch/x86/entry/syscalls/Makefile | 25 +++++++++++----------- arch/xtensa/kernel/syscalls/Makefile | 11 +++++----- 11 files changed, 91 insertions(+), 80 deletions(-) diff --git a/arch/alpha/kernel/syscalls/Makefile b/arch/alpha/kernel/syscalls/Makefile index 659faefdcb1d..1c42d2d2926d 100644 --- a/arch/alpha/kernel/syscalls/Makefile +++ b/arch/alpha/kernel/syscalls/Makefile @@ -21,18 +21,19 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_abi_$(basetarget))' \ '$(systbl_offset_$(basetarget))' -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -$(kapi)/syscall_table.h: $(syscall) $(systbl) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h kapisyshdr-y += syscall_table.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/ia64/kernel/syscalls/Makefile b/arch/ia64/kernel/syscalls/Makefile index 813a58cba39c..b9bfd186295f 100644 --- a/arch/ia64/kernel/syscalls/Makefile +++ b/arch/ia64/kernel/syscalls/Makefile @@ -22,19 +22,20 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_offset_$(basetarget))' syshdr_offset_unistd_64 := __NR_Linux -$(uapi)/unistd_64.h: $(syscall) $(syshdr) +$(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) systbl_offset_syscall_table := 1024 -$(kapi)/syscall_table.h: $(syscall) $(systbl) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_64.h kapisyshdr-y += syscall_table.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/m68k/kernel/syscalls/Makefile b/arch/m68k/kernel/syscalls/Makefile index 659faefdcb1d..1c42d2d2926d 100644 --- a/arch/m68k/kernel/syscalls/Makefile +++ b/arch/m68k/kernel/syscalls/Makefile @@ -21,18 +21,19 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_abi_$(basetarget))' \ '$(systbl_offset_$(basetarget))' -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -$(kapi)/syscall_table.h: $(syscall) $(systbl) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h kapisyshdr-y += syscall_table.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/microblaze/kernel/syscalls/Makefile b/arch/microblaze/kernel/syscalls/Makefile index 659faefdcb1d..1c42d2d2926d 100644 --- a/arch/microblaze/kernel/syscalls/Makefile +++ b/arch/microblaze/kernel/syscalls/Makefile @@ -21,18 +21,19 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_abi_$(basetarget))' \ '$(systbl_offset_$(basetarget))' -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -$(kapi)/syscall_table.h: $(syscall) $(systbl) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h kapisyshdr-y += syscall_table.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/mips/kernel/syscalls/Makefile b/arch/mips/kernel/syscalls/Makefile index 6efb2f6889a7..f15842bda464 100644 --- a/arch/mips/kernel/syscalls/Makefile +++ b/arch/mips/kernel/syscalls/Makefile @@ -31,50 +31,50 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_offset_$(basetarget))' syshdr_offset_unistd_n32 := __NR_Linux -$(uapi)/unistd_n32.h: $(syscalln32) $(syshdr) +$(uapi)/unistd_n32.h: $(syscalln32) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_offset_unistd_n64 := __NR_Linux -$(uapi)/unistd_n64.h: $(syscalln64) $(syshdr) +$(uapi)/unistd_n64.h: $(syscalln64) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_offset_unistd_o32 := __NR_Linux -$(uapi)/unistd_o32.h: $(syscallo32) $(syshdr) +$(uapi)/unistd_o32.h: $(syscallo32) $(syshdr) FORCE $(call if_changed,syshdr) sysnr_pfx_unistd_nr_n32 := N32 sysnr_offset_unistd_nr_n32 := 6000 -$(uapi)/unistd_nr_n32.h: $(syscalln32) $(sysnr) +$(uapi)/unistd_nr_n32.h: $(syscalln32) $(sysnr) FORCE $(call if_changed,sysnr) sysnr_pfx_unistd_nr_n64 := 64 sysnr_offset_unistd_nr_n64 := 5000 -$(uapi)/unistd_nr_n64.h: $(syscalln64) $(sysnr) +$(uapi)/unistd_nr_n64.h: $(syscalln64) $(sysnr) FORCE $(call if_changed,sysnr) sysnr_pfx_unistd_nr_o32 := O32 sysnr_offset_unistd_nr_o32 := 4000 -$(uapi)/unistd_nr_o32.h: $(syscallo32) $(sysnr) +$(uapi)/unistd_nr_o32.h: $(syscallo32) $(sysnr) FORCE $(call if_changed,sysnr) systbl_abi_syscall_table_32_o32 := 32_o32 systbl_offset_syscall_table_32_o32 := 4000 -$(kapi)/syscall_table_32_o32.h: $(syscallo32) $(systbl) +$(kapi)/syscall_table_32_o32.h: $(syscallo32) $(systbl) FORCE $(call if_changed,systbl) systbl_abi_syscall_table_64_n32 := 64_n32 systbl_offset_syscall_table_64_n32 := 6000 -$(kapi)/syscall_table_64_n32.h: $(syscalln32) $(systbl) +$(kapi)/syscall_table_64_n32.h: $(syscalln32) $(systbl) FORCE $(call if_changed,systbl) systbl_abi_syscall_table_64_n64 := 64_n64 systbl_offset_syscall_table_64_n64 := 5000 -$(kapi)/syscall_table_64_n64.h: $(syscalln64) $(systbl) +$(kapi)/syscall_table_64_n64.h: $(syscalln64) $(systbl) FORCE $(call if_changed,systbl) systbl_abi_syscall_table_64_o32 := 64_o32 systbl_offset_syscall_table_64_o32 := 4000 -$(kapi)/syscall_table_64_o32.h: $(syscallo32) $(systbl) +$(kapi)/syscall_table_64_o32.h: $(syscallo32) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_n32.h \ @@ -88,9 +88,10 @@ kapisyshdr-y += syscall_table_32_o32.h \ syscall_table_64_n64.h \ syscall_table_64_o32.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/parisc/kernel/syscalls/Makefile b/arch/parisc/kernel/syscalls/Makefile index c22a21c39f30..556fe30a6c8f 100644 --- a/arch/parisc/kernel/syscalls/Makefile +++ b/arch/parisc/kernel/syscalls/Makefile @@ -22,24 +22,24 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_offset_$(basetarget))' syshdr_abis_unistd_32 := common,32 -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abis_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall) $(syshdr) +$(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) systbl_abis_syscall_table_32 := common,32 -$(kapi)/syscall_table_32.h: $(syscall) $(systbl) +$(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) systbl_abis_syscall_table_64 := common,64 -$(kapi)/syscall_table_64.h: $(syscall) $(systbl) +$(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) systbl_abis_syscall_table_c32 := common,32 systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) +$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h unistd_64.h @@ -47,9 +47,10 @@ kapisyshdr-y += syscall_table_32.h \ syscall_table_64.h \ syscall_table_c32.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/powerpc/kernel/syscalls/Makefile b/arch/powerpc/kernel/syscalls/Makefile index 27b48954808d..d609f0040b2a 100644 --- a/arch/powerpc/kernel/syscalls/Makefile +++ b/arch/powerpc/kernel/syscalls/Makefile @@ -22,31 +22,31 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_offset_$(basetarget))' syshdr_abis_unistd_32 := common,nospu,32 -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abis_unistd_64 := common,nospu,64 -$(uapi)/unistd_64.h: $(syscall) $(syshdr) +$(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) systbl_abis_syscall_table_32 := common,nospu,32 systbl_abi_syscall_table_32 := 32 -$(kapi)/syscall_table_32.h: $(syscall) $(systbl) +$(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) systbl_abis_syscall_table_64 := common,nospu,64 systbl_abi_syscall_table_64 := 64 -$(kapi)/syscall_table_64.h: $(syscall) $(systbl) +$(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) systbl_abis_syscall_table_c32 := common,nospu,32 systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) +$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) systbl_abis_syscall_table_spu := common,spu systbl_abi_syscall_table_spu := spu -$(kapi)/syscall_table_spu.h: $(syscall) $(systbl) +$(kapi)/syscall_table_spu.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h unistd_64.h @@ -55,9 +55,10 @@ kapisyshdr-y += syscall_table_32.h \ syscall_table_c32.h \ syscall_table_spu.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/sh/kernel/syscalls/Makefile b/arch/sh/kernel/syscalls/Makefile index 659faefdcb1d..1c42d2d2926d 100644 --- a/arch/sh/kernel/syscalls/Makefile +++ b/arch/sh/kernel/syscalls/Makefile @@ -21,18 +21,19 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_abi_$(basetarget))' \ '$(systbl_offset_$(basetarget))' -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -$(kapi)/syscall_table.h: $(syscall) $(systbl) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h kapisyshdr-y += syscall_table.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/sparc/kernel/syscalls/Makefile b/arch/sparc/kernel/syscalls/Makefile index c22a21c39f30..556fe30a6c8f 100644 --- a/arch/sparc/kernel/syscalls/Makefile +++ b/arch/sparc/kernel/syscalls/Makefile @@ -22,24 +22,24 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_offset_$(basetarget))' syshdr_abis_unistd_32 := common,32 -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abis_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall) $(syshdr) +$(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) systbl_abis_syscall_table_32 := common,32 -$(kapi)/syscall_table_32.h: $(syscall) $(systbl) +$(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) systbl_abis_syscall_table_64 := common,64 -$(kapi)/syscall_table_64.h: $(syscall) $(systbl) +$(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) systbl_abis_syscall_table_c32 := common,32 systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) +$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h unistd_64.h @@ -47,9 +47,10 @@ kapisyshdr-y += syscall_table_32.h \ syscall_table_64.h \ syscall_table_c32.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index 6fb9b57ed5ba..b0adc1704fde 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -21,37 +21,37 @@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ quiet_cmd_hypercalls = HYPERCALLS $@ - cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) + cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs)) syshdr_abi_unistd_32 := i386 -$(uapi)/unistd_32.h: $(syscall32) $(syshdr) +$(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_32_ia32 := i386 syshdr_pfx_unistd_32_ia32 := ia32_ -$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_x32 := common,x32 syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT -$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) +$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall64) $(syshdr) +$(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_64_x32 := x32 syshdr_pfx_unistd_64_x32 := x32_ -$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) +$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -$(out)/syscalls_32.h: $(syscall32) $(systbl) +$(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE $(call if_changed,systbl) -$(out)/syscalls_64.h: $(syscall64) $(systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE $(call if_changed,systbl) -$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh +$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE $(call if_changed,hypercalls) $(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h @@ -62,9 +62,10 @@ syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h syshdr-$(CONFIG_XEN) += xen-hypercalls.h -targets += $(uapisyshdr-y) $(syshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +syshdr-y := $(addprefix $(out)/, $(syshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(syshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(out)/,$(syshdr-y)) +all: $(uapisyshdr-y) $(syshdr-y) @: diff --git a/arch/xtensa/kernel/syscalls/Makefile b/arch/xtensa/kernel/syscalls/Makefile index 659faefdcb1d..1c42d2d2926d 100644 --- a/arch/xtensa/kernel/syscalls/Makefile +++ b/arch/xtensa/kernel/syscalls/Makefile @@ -21,18 +21,19 @@ quiet_cmd_systbl = SYSTBL $@ '$(systbl_abi_$(basetarget))' \ '$(systbl_offset_$(basetarget))' -$(uapi)/unistd_32.h: $(syscall) $(syshdr) +$(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -$(kapi)/syscall_table.h: $(syscall) $(systbl) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h kapisyshdr-y += syscall_table.h -targets += $(uapisyshdr-y) $(kapisyshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(kapi)/,$(kapisyshdr-y)) +all: $(uapisyshdr-y) $(kapisyshdr-y) @: From 29c5c3ac633161f4ae2f4bb5f278b3719391b20e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 15 Feb 2021 09:48:23 +0900 Subject: [PATCH 216/506] arch: syscalls: remove $(srctree)/ prefix from syscall tables The 'syscall' variables are not directly used in the commands. Remove the $(srctree)/ prefix because we can rely on VPATH. Signed-off-by: Masahiro Yamada --- arch/alpha/kernel/syscalls/Makefile | 2 +- arch/arm/tools/Makefile | 2 +- arch/ia64/kernel/syscalls/Makefile | 2 +- arch/m68k/kernel/syscalls/Makefile | 2 +- arch/microblaze/kernel/syscalls/Makefile | 2 +- arch/mips/kernel/syscalls/Makefile | 6 +++--- arch/parisc/kernel/syscalls/Makefile | 2 +- arch/powerpc/kernel/syscalls/Makefile | 2 +- arch/sh/kernel/syscalls/Makefile | 2 +- arch/sparc/kernel/syscalls/Makefile | 2 +- arch/x86/entry/syscalls/Makefile | 4 ++-- arch/xtensa/kernel/syscalls/Makefile | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/alpha/kernel/syscalls/Makefile b/arch/alpha/kernel/syscalls/Makefile index 1c42d2d2926d..285aaba832d9 100644 --- a/arch/alpha/kernel/syscalls/Makefile +++ b/arch/alpha/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/arm/tools/Makefile b/arch/arm/tools/Makefile index 27d8beb7c941..3654f979851b 100644 --- a/arch/arm/tools/Makefile +++ b/arch/arm/tools/Makefile @@ -11,7 +11,7 @@ uapi := $(gen)/uapi/asm syshdr := $(srctree)/$(src)/syscallhdr.sh sysnr := $(srctree)/$(src)/syscallnr.sh systbl := $(srctree)/$(src)/syscalltbl.sh -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl gen-y := $(gen)/calls-oabi.S gen-y += $(gen)/calls-eabi.S diff --git a/arch/ia64/kernel/syscalls/Makefile b/arch/ia64/kernel/syscalls/Makefile index b9bfd186295f..bf4bda0f63eb 100644 --- a/arch/ia64/kernel/syscalls/Makefile +++ b/arch/ia64/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/m68k/kernel/syscalls/Makefile b/arch/m68k/kernel/syscalls/Makefile index 1c42d2d2926d..285aaba832d9 100644 --- a/arch/m68k/kernel/syscalls/Makefile +++ b/arch/m68k/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/microblaze/kernel/syscalls/Makefile b/arch/microblaze/kernel/syscalls/Makefile index 1c42d2d2926d..285aaba832d9 100644 --- a/arch/microblaze/kernel/syscalls/Makefile +++ b/arch/microblaze/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/mips/kernel/syscalls/Makefile b/arch/mips/kernel/syscalls/Makefile index f15842bda464..ed22b711ccb7 100644 --- a/arch/mips/kernel/syscalls/Makefile +++ b/arch/mips/kernel/syscalls/Makefile @@ -5,9 +5,9 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscalln32 := $(srctree)/$(src)/syscall_n32.tbl -syscalln64 := $(srctree)/$(src)/syscall_n64.tbl -syscallo32 := $(srctree)/$(src)/syscall_o32.tbl +syscalln32 := $(src)/syscall_n32.tbl +syscalln64 := $(src)/syscall_n64.tbl +syscallo32 := $(src)/syscall_o32.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh sysnr := $(srctree)/$(src)/syscallnr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/parisc/kernel/syscalls/Makefile b/arch/parisc/kernel/syscalls/Makefile index 556fe30a6c8f..283f64407b07 100644 --- a/arch/parisc/kernel/syscalls/Makefile +++ b/arch/parisc/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/powerpc/kernel/syscalls/Makefile b/arch/powerpc/kernel/syscalls/Makefile index d609f0040b2a..9e3be295dbba 100644 --- a/arch/powerpc/kernel/syscalls/Makefile +++ b/arch/powerpc/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/sh/kernel/syscalls/Makefile b/arch/sh/kernel/syscalls/Makefile index 1c42d2d2926d..285aaba832d9 100644 --- a/arch/sh/kernel/syscalls/Makefile +++ b/arch/sh/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/sparc/kernel/syscalls/Makefile b/arch/sparc/kernel/syscalls/Makefile index 556fe30a6c8f..283f64407b07 100644 --- a/arch/sparc/kernel/syscalls/Makefile +++ b/arch/sparc/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index b0adc1704fde..d8c4f6c9eadc 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -6,8 +6,8 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') -syscall32 := $(srctree)/$(src)/syscall_32.tbl -syscall64 := $(srctree)/$(src)/syscall_64.tbl +syscall32 := $(src)/syscall_32.tbl +syscall64 := $(src)/syscall_64.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh diff --git a/arch/xtensa/kernel/syscalls/Makefile b/arch/xtensa/kernel/syscalls/Makefile index 1c42d2d2926d..285aaba832d9 100644 --- a/arch/xtensa/kernel/syscalls/Makefile +++ b/arch/xtensa/kernel/syscalls/Makefile @@ -5,7 +5,7 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') -syscall := $(srctree)/$(src)/syscall.tbl +syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh From 9df526b03c01ad98ed64e46c5e15b65fe89e25f6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 16 Feb 2021 11:04:11 +0900 Subject: [PATCH 217/506] scripts: add generic syscalltbl.sh Most of architectures generate syscall headers at the compile time in a similar way. The syscall table has the same format for all architectures. Each line has up to 5 fields; syscall number, ABI, syscall name, native entry point, and compat entry point. The syscall table is processed by syscalltbl.sh script into header files. Despite the same pattern, scripts are maintained per architecture, which results in code duplication and bad maintainability. As of v5.11-rc1, 12 architectures duplicate similar shell scripts: $ find arch -name syscalltbl.sh | sort arch/alpha/kernel/syscalls/syscalltbl.sh arch/arm/tools/syscalltbl.sh arch/ia64/kernel/syscalls/syscalltbl.sh arch/m68k/kernel/syscalls/syscalltbl.sh arch/microblaze/kernel/syscalls/syscalltbl.sh arch/mips/kernel/syscalls/syscalltbl.sh arch/parisc/kernel/syscalls/syscalltbl.sh arch/powerpc/kernel/syscalls/syscalltbl.sh arch/sh/kernel/syscalls/syscalltbl.sh arch/sparc/kernel/syscalls/syscalltbl.sh arch/x86/entry/syscalls/syscalltbl.sh arch/xtensa/kernel/syscalls/syscalltbl.sh My goal is to unify them into scripts/syscalltbl.sh. __SYSCALL_WITH_COMPAT should be defined as follows: 32-bit kernel: #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) 64-bit kernel: #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) Signed-off-by: Masahiro Yamada --- scripts/syscalltbl.sh | 73 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 scripts/syscalltbl.sh diff --git a/scripts/syscalltbl.sh b/scripts/syscalltbl.sh new file mode 100755 index 000000000000..aa6ab156301c --- /dev/null +++ b/scripts/syscalltbl.sh @@ -0,0 +1,73 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only +# +# Generate a syscall table header. +# +# Each line of the syscall table should have the following format: +# +# NR ABI NAME [NATIVE] [COMPAT] +# +# NR syscall number +# ABI ABI name +# NAME syscall name +# NATIVE native entry point (optional) +# COMPAT compat entry point (optional) + +set -e + +usage() { + echo >&2 "usage: $0 [--abis ABIS] INFILE OUTFILE" >&2 + echo >&2 + echo >&2 " INFILE input syscall table" + echo >&2 " OUTFILE output header file" + echo >&2 + echo >&2 "options:" + echo >&2 " --abis ABIS ABI(s) to handle (By default, all lines are handled)" + exit 1 +} + +# default unless specified by options +abis= + +while [ $# -gt 0 ] +do + case $1 in + --abis) + abis=$(echo "($2)" | tr ',' '|') + shift 2;; + -*) + echo "$1: unknown option" >&2 + usage;; + *) + break;; + esac +done + +if [ $# -ne 2 ]; then + usage +fi + +infile="$1" +outfile="$2" + +nxt=0 + +grep -E "^[0-9]+[[:space:]]+$abis" "$infile" | sort -n | { + + while read nr abi name native compat ; do + + while [ $nxt -lt $nr ]; do + echo "__SYSCALL($nxt, sys_ni_syscall)" + nxt=$((nxt + 1)) + done + + if [ -n "$compat" ]; then + echo "__SYSCALL_WITH_COMPAT($nr, $native, $compat)" + elif [ -n "$native" ]; then + echo "__SYSCALL($nr, $native)" + else + echo "__SYSCALL($nr, sys_ni_syscall)" + fi + nxt=$((nr + 1)) + done +} > "$outfile" From b9da928abf45c8a9373a6f74765c8d9261dee8c1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 16 Feb 2021 11:04:12 +0900 Subject: [PATCH 218/506] scripts: add generic syscallhdr.sh Most of architectures generate syscall headers at the compile time in a similar way. As of v5.11-rc1, 12 architectures duplicate similar shell scripts: $ find arch -name syscallhdr.sh | sort arch/alpha/kernel/syscalls/syscallhdr.sh arch/arm/tools/syscallhdr.sh arch/ia64/kernel/syscalls/syscallhdr.sh arch/m68k/kernel/syscalls/syscallhdr.sh arch/microblaze/kernel/syscalls/syscallhdr.sh arch/mips/kernel/syscalls/syscallhdr.sh arch/parisc/kernel/syscalls/syscallhdr.sh arch/powerpc/kernel/syscalls/syscallhdr.sh arch/sh/kernel/syscalls/syscallhdr.sh arch/sparc/kernel/syscalls/syscallhdr.sh arch/x86/entry/syscalls/syscallhdr.sh arch/xtensa/kernel/syscalls/syscallhdr.sh My goal is to unify them into scripts/syscallhdr.sh. Signed-off-by: Masahiro Yamada --- scripts/syscallhdr.sh | 98 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100755 scripts/syscallhdr.sh diff --git a/scripts/syscallhdr.sh b/scripts/syscallhdr.sh new file mode 100755 index 000000000000..848ac2735115 --- /dev/null +++ b/scripts/syscallhdr.sh @@ -0,0 +1,98 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only +# +# Generate a syscall number header. +# +# Each line of the syscall table should have the following format: +# +# NR ABI NAME [NATIVE] [COMPAT] +# +# NR syscall number +# ABI ABI name +# NAME syscall name +# NATIVE native entry point (optional) +# COMPAT compat entry point (optional) + +set -e + +usage() { + echo >&2 "usage: $0 [--abis ABIS] [--emit-nr] [--offset OFFSET] [--prefix PREFIX] INFILE OUTFILE" >&2 + echo >&2 + echo >&2 " INFILE input syscall table" + echo >&2 " OUTFILE output header file" + echo >&2 + echo >&2 "options:" + echo >&2 " --abis ABIS ABI(s) to handle (By default, all lines are handled)" + echo >&2 " --emit-nr Emit the macro of the number of syscalls (__NR_syscalls)" + echo >&2 " --offset OFFSET The offset of syscall numbers" + echo >&2 " --prefix PREFIX The prefix to the macro like __NR_" + exit 1 +} + +# default unless specified by options +abis= +emit_nr= +offset= +prefix= + +while [ $# -gt 0 ] +do + case $1 in + --abis) + abis=$(echo "($2)" | tr ',' '|') + shift 2;; + --emit-nr) + emit_nr=1 + shift 1;; + --offset) + offset=$2 + shift 2;; + --prefix) + prefix=$2 + shift 2;; + -*) + echo "$1: unknown option" >&2 + usage;; + *) + break;; + esac +done + +if [ $# -ne 2 ]; then + usage +fi + +infile="$1" +outfile="$2" + +guard=_UAPI_ASM_$(basename "$outfile" | + sed -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g') + +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+$abis" "$infile" | sort -n | { + echo "#ifndef $guard" + echo "#define $guard" + echo + + max=0 + while read nr abi name native compat ; do + + max=$nr + + if [ -n "$offset" ]; then + nr="($offset + $nr)" + fi + + echo "#define __NR_$prefix$name $nr" + done + + if [ -n "$emit_nr" ]; then + echo + echo "#ifdef __KERNEL__" + echo "#define __NR_${prefix}syscalls $(($max + 1))" + echo "#endif" + fi + + echo + echo "#endif /* $guard */" +} > "$outfile" From 05f6bbf2d714309607d5533f0265a95d037610b4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 16 Feb 2021 12:10:03 +0900 Subject: [PATCH 219/506] kbuild: remove ld-version macro There is no direct user of ld-version; you can use CONFIG_LD_VERSION if needed. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/Kbuild.include | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 08e011175b4c..509e0856d653 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -141,13 +141,9 @@ cc-ifversion = $(shell [ $(CONFIG_GCC_VERSION)0 $(1) $(2)000 ] && echo $(3) || e # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y) ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3)) -# ld-version -# Note this is mainly for HJ Lu's 3 number binutil versions -ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh) - # ld-ifversion # Usage: $(call ld-ifversion, -ge, 22252, y) -ld-ifversion = $(shell [ $(ld-version) $(1) $(2) ] && echo $(3) || echo $(4)) +ld-ifversion = $(shell [ $(CONFIG_LD_VERSION)0 $(1) $(2)0 ] && echo $(3) || echo $(4)) ###### From 02aff85922043cf175ebbe5fc3430acfeaeb8393 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 16 Feb 2021 12:10:04 +0900 Subject: [PATCH 220/506] kbuild: check the minimum linker version in Kconfig Unify the two scripts/ld-version.sh and scripts/lld-version.sh, and check the minimum linker version like scripts/cc-version.sh did. I tested this script for some corner cases reported in the past: - GNU ld version 2.25-15.fc23 as reported by commit 8083013fc320 ("ld-version: Fix it on Fedora") - GNU ld (GNU Binutils) 2.20.1.20100303 as reported by commit 0d61ed17dd30 ("ld-version: Drop the 4th and 5th version components") This script show an error message if the linker is too old: $ make LD=ld.lld-9 SYNC include/config/auto.conf *** *** Linker is too old. *** Your LLD version: 9.0.1 *** Minimum LLD version: 10.0.1 *** scripts/Kconfig.include:50: Sorry, this linker is not supported. make[2]: *** [scripts/kconfig/Makefile:71: syncconfig] Error 1 make[1]: *** [Makefile:600: syncconfig] Error 2 make: *** [Makefile:708: include/config/auto.conf] Error 2 I also moved the check for gold to this script, so gold is still rejected: $ make LD=gold SYNC include/config/auto.conf gold linker is not supported as it is not capable of linking the kernel proper. scripts/Kconfig.include:50: Sorry, this linker is not supported. make[2]: *** [scripts/kconfig/Makefile:71: syncconfig] Error 1 make[1]: *** [Makefile:600: syncconfig] Error 2 make: *** [Makefile:708: include/config/auto.conf] Error 2 Thanks to David Laight for suggesting shell script improvements. Signed-off-by: Masahiro Yamada Acked-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor --- MAINTAINERS | 1 - init/Kconfig | 21 +++++++---- scripts/Kconfig.include | 7 +++- scripts/ld-version.sh | 82 ++++++++++++++++++++++++++++++++++++----- scripts/lld-version.sh | 20 ---------- 5 files changed, 90 insertions(+), 41 deletions(-) delete mode 100755 scripts/lld-version.sh diff --git a/MAINTAINERS b/MAINTAINERS index df820969be1f..6b82ad6990b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4314,7 +4314,6 @@ C: irc://chat.freenode.net/clangbuiltlinux F: Documentation/kbuild/llvm.rst F: include/linux/compiler-clang.h F: scripts/clang-tools/ -F: scripts/lld-version.sh K: \b(?i:clang|llvm)\b CLEANCACHE API diff --git a/init/Kconfig b/init/Kconfig index 7bcfa24524c2..42b69ee29dca 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -33,24 +33,29 @@ config GCC_VERSION default $(cc-version) if CC_IS_GCC default 0 -config LD_VERSION - int - default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh) - config CC_IS_CLANG def_bool $(success,test "$(cc-name)" = Clang) -config LD_IS_LLD - def_bool $(success,$(LD) -v | head -n 1 | grep -q LLD) - config CLANG_VERSION int default $(cc-version) if CC_IS_CLANG default 0 +config LD_IS_BFD + def_bool $(success,test "$(ld-name)" = BFD) + +config LD_VERSION + int + default $(ld-version) if LD_IS_BFD + default 0 + +config LD_IS_LLD + def_bool $(success,test "$(ld-name)" = LLD) + config LLD_VERSION int - default $(shell,$(srctree)/scripts/lld-version.sh $(LD)) + default $(ld-version) if LD_IS_LLD + default 0 config CC_CAN_LINK bool diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include index 0228cb9c74aa..58fdb5308725 100644 --- a/scripts/Kconfig.include +++ b/scripts/Kconfig.include @@ -45,8 +45,11 @@ $(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this compiler is not su cc-name := $(shell,set -- $(cc-info) && echo $1) cc-version := $(shell,set -- $(cc-info) && echo $2) -# Fail if the linker is gold as it's not capable of linking the kernel proper -$(error-if,$(success, $(LD) -v | grep -q gold), gold linker '$(LD)' not supported) +# Get the linker name, version, and error out if it is not supported. +ld-info := $(shell,$(srctree)/scripts/ld-version.sh $(LD)) +$(error-if,$(success,test -z "$(ld-info)"),Sorry$(comma) this linker is not supported.) +ld-name := $(shell,set -- $(ld-info) && echo $1) +ld-version := $(shell,set -- $(ld-info) && echo $2) # machine bit flags # $(m32-flag): -m32 if the compiler supports it, or an empty string otherwise. diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh index 0f8a2c0f9502..a463273509b5 100755 --- a/scripts/ld-version.sh +++ b/scripts/ld-version.sh @@ -1,11 +1,73 @@ -#!/usr/bin/awk -f +#!/bin/sh # SPDX-License-Identifier: GPL-2.0 -# extract linker version number from stdin and turn into single number - { - gsub(".*\\)", ""); - gsub(".*version ", ""); - gsub("-.*", ""); - split($1,a, "."); - print a[1]*10000 + a[2]*100 + a[3]; - exit - } +# +# Print the linker name and its version in a 5 or 6-digit form. +# Also, perform the minimum version check. + +set -e + +# When you raise the minimum linker version, please update +# Documentation/process/changes.rst as well. +bfd_min_version=2.23.0 +lld_min_version=10.0.1 + +# Convert the version string x.y.z to a canonical 5 or 6-digit form. +get_canonical_version() +{ + IFS=. + set -- $1 + + # If the 2nd or 3rd field is missing, fill it with a zero. + # + # The 4th field, if present, is ignored. + # This occurs in development snapshots as in 2.35.1.20201116 + echo $((10000 * $1 + 100 * ${2:-0} + ${3:-0})) +} + +orig_args="$@" + +# Get the first line of the --version output. +IFS=' +' +set -- $("$@" --version) + +# Split the line on spaces. +IFS=' ' +set -- $1 + +if [ "$1" = GNU -a "$2" = ld ]; then + shift $(($# - 1)) + version=$1 + min_version=$bfd_min_version + name=BFD + disp_name="GNU ld" +elif [ "$1" = GNU -a "$2" = gold ]; then + echo "gold linker is not supported as it is not capable of linking the kernel proper." >&2 + exit 1 +elif [ "$1" = LLD ]; then + version=$2 + min_version=$lld_min_version + name=LLD + disp_name=LLD +else + echo "$orig_args: unknown linker" >&2 + exit 1 +fi + +# Some distributions append a package release number, as in 2.34-4.fc32 +# Trim the hyphen and any characters that follow. +version=${version%-*} + +cversion=$(get_canonical_version $version) +min_cversion=$(get_canonical_version $min_version) + +if [ "$cversion" -lt "$min_cversion" ]; then + echo >&2 "***" + echo >&2 "*** Linker is too old." + echo >&2 "*** Your $disp_name version: $version" + echo >&2 "*** Minimum $disp_name version: $min_version" + echo >&2 "***" + exit 1 +fi + +echo $name $cversion diff --git a/scripts/lld-version.sh b/scripts/lld-version.sh deleted file mode 100755 index d70edb4d8a4f..000000000000 --- a/scripts/lld-version.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Usage: $ ./scripts/lld-version.sh ld.lld -# -# Print the linker version of `ld.lld' in a 5 or 6-digit form -# such as `100001' for ld.lld 10.0.1 etc. - -linker_string="$($* --version)" - -if ! ( echo $linker_string | grep -q LLD ); then - echo 0 - exit 1 -fi - -VERSION=$(echo $linker_string | cut -d ' ' -f 2) -MAJOR=$(echo $VERSION | cut -d . -f 1) -MINOR=$(echo $VERSION | cut -d . -f 2) -PATCHLEVEL=$(echo $VERSION | cut -d . -f 3) -printf "%d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL From 97ea656521c8e94c202d24f9d953cb65297f9aec Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 29 Jan 2021 00:49:32 +0000 Subject: [PATCH 221/506] drm/i915/gvt: Parse default state to update reg whitelist Rather than break existing context objects by incorrectly forcing them to rogue cache coherency and trying to assert a new mapping, read the reg whitelist from the default context image. And use gvt->gt, never &dev_priv->gt. Fixes: 493f30cd086e ("drm/i915/gvt: parse init context to update cmd accessible reg whitelist") Acked-by: Zhenyu Wang Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Kevin Tian Cc: Wang Zhi Cc: Yan Zhao Cc: Zhenyu Wang Cc: Zhi Wang Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20210129004933.29755-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gvt/cmd_parser.c | 93 ++++++--------------------- 1 file changed, 20 insertions(+), 73 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index d54ea0e4681d..fef1e857cefc 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -41,6 +41,7 @@ #include "gt/intel_lrc.h" #include "gt/intel_ring.h" #include "gt/intel_gt_requests.h" +#include "gt/shmem_utils.h" #include "gvt.h" #include "i915_pvinfo.h" #include "trace.h" @@ -3094,71 +3095,28 @@ int intel_gvt_scan_and_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) */ void intel_gvt_update_reg_whitelist(struct intel_vgpu *vgpu) { + const unsigned long start = LRC_STATE_PN * PAGE_SIZE; struct intel_gvt *gvt = vgpu->gvt; - struct drm_i915_private *dev_priv = gvt->gt->i915; struct intel_engine_cs *engine; enum intel_engine_id id; - const unsigned long start = LRC_STATE_PN * PAGE_SIZE; - struct i915_request *rq; - struct intel_vgpu_submission *s = &vgpu->submission; - struct i915_request *requests[I915_NUM_ENGINES] = {}; - bool is_ctx_pinned[I915_NUM_ENGINES] = {}; - int ret = 0; if (gvt->is_reg_whitelist_updated) return; - for_each_engine(engine, &dev_priv->gt, id) { - ret = intel_context_pin(s->shadow[id]); - if (ret) { - gvt_vgpu_err("fail to pin shadow ctx\n"); - goto out; - } - is_ctx_pinned[id] = true; - - rq = i915_request_create(s->shadow[id]); - if (IS_ERR(rq)) { - gvt_vgpu_err("fail to alloc default request\n"); - ret = -EIO; - goto out; - } - requests[id] = i915_request_get(rq); - i915_request_add(rq); - } - - if (intel_gt_wait_for_idle(&dev_priv->gt, - I915_GEM_IDLE_TIMEOUT) == -ETIME) { - ret = -EIO; - goto out; - } - /* scan init ctx to update cmd accessible list */ - for_each_engine(engine, &dev_priv->gt, id) { - int size = engine->context_size - PAGE_SIZE; - void *vaddr; + for_each_engine(engine, gvt->gt, id) { struct parser_exec_state s; - struct drm_i915_gem_object *obj; - struct i915_request *rq; + void *vaddr; + int ret; - rq = requests[id]; - GEM_BUG_ON(!i915_request_completed(rq)); - GEM_BUG_ON(!intel_context_is_pinned(rq->context)); - obj = rq->context->state->obj; + if (!engine->default_state) + continue; - if (!obj) { - ret = -EIO; - goto out; - } - - i915_gem_object_set_cache_coherency(obj, - I915_CACHE_LLC); - - vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); + vaddr = shmem_pin_map(engine->default_state); if (IS_ERR(vaddr)) { - gvt_err("failed to pin init ctx obj, ring=%d, err=%lx\n", - id, PTR_ERR(vaddr)); - ret = PTR_ERR(vaddr); - goto out; + gvt_err("failed to map %s->default state, err:%zd\n", + engine->name, PTR_ERR(vaddr)); + return; } s.buf_type = RING_BUFFER_CTX; @@ -3166,9 +3124,9 @@ void intel_gvt_update_reg_whitelist(struct intel_vgpu *vgpu) s.vgpu = vgpu; s.engine = engine; s.ring_start = 0; - s.ring_size = size; + s.ring_size = engine->context_size - start; s.ring_head = 0; - s.ring_tail = size; + s.ring_tail = s.ring_size; s.rb_va = vaddr + start; s.workload = NULL; s.is_ctx_wa = false; @@ -3176,29 +3134,18 @@ void intel_gvt_update_reg_whitelist(struct intel_vgpu *vgpu) /* skipping the first RING_CTX_SIZE(0x50) dwords */ ret = ip_gma_set(&s, RING_CTX_SIZE); - if (ret) { - i915_gem_object_unpin_map(obj); - goto out; + if (ret == 0) { + ret = command_scan(&s, 0, s.ring_size, 0, s.ring_size); + if (ret) + gvt_err("Scan init ctx error\n"); } - ret = command_scan(&s, 0, size, 0, size); + shmem_unpin_map(engine->default_state, vaddr); if (ret) - gvt_err("Scan init ctx error\n"); - - i915_gem_object_unpin_map(obj); + return; } -out: - if (!ret) - gvt->is_reg_whitelist_updated = true; - - for (id = 0; id < I915_NUM_ENGINES ; id++) { - if (requests[id]) - i915_request_put(requests[id]); - - if (is_ctx_pinned[id]) - intel_context_unpin(s->shadow[id]); - } + gvt->is_reg_whitelist_updated = true; } int intel_gvt_scan_engine_context(struct intel_vgpu_workload *workload) From d18ac1a7eef9ec266142b637253353a8d9f95cc1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 29 Jan 2021 00:49:33 +0000 Subject: [PATCH 222/506] drm/i915/gvt: Purge dev_priv->gt Use the right intel_gt stored as a backpointer in intel_vgpu. Signed-off-by: Chris Wilson Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20210129004933.29755-2-chris@chris-wilson.co.uk Reviewed-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/execlist.c | 8 +++----- drivers/gpu/drm/i915/gvt/scheduler.c | 3 +-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/execlist.c b/drivers/gpu/drm/i915/gvt/execlist.c index 158873f269b1..c8dcda6d4f0d 100644 --- a/drivers/gpu/drm/i915/gvt/execlist.c +++ b/drivers/gpu/drm/i915/gvt/execlist.c @@ -522,12 +522,11 @@ static void init_vgpu_execlist(struct intel_vgpu *vgpu, static void clean_execlist(struct intel_vgpu *vgpu, intel_engine_mask_t engine_mask) { - struct drm_i915_private *dev_priv = vgpu->gvt->gt->i915; - struct intel_engine_cs *engine; struct intel_vgpu_submission *s = &vgpu->submission; + struct intel_engine_cs *engine; intel_engine_mask_t tmp; - for_each_engine_masked(engine, &dev_priv->gt, engine_mask, tmp) { + for_each_engine_masked(engine, vgpu->gvt->gt, engine_mask, tmp) { kfree(s->ring_scan_buffer[engine->id]); s->ring_scan_buffer[engine->id] = NULL; s->ring_scan_buffer_size[engine->id] = 0; @@ -537,11 +536,10 @@ static void clean_execlist(struct intel_vgpu *vgpu, static void reset_execlist(struct intel_vgpu *vgpu, intel_engine_mask_t engine_mask) { - struct drm_i915_private *dev_priv = vgpu->gvt->gt->i915; struct intel_engine_cs *engine; intel_engine_mask_t tmp; - for_each_engine_masked(engine, &dev_priv->gt, engine_mask, tmp) + for_each_engine_masked(engine, vgpu->gvt->gt, engine_mask, tmp) init_vgpu_execlist(vgpu, engine); } diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index 43f31c2eab14..a55ae50dbbe1 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -1015,13 +1015,12 @@ void intel_vgpu_clean_workloads(struct intel_vgpu *vgpu, intel_engine_mask_t engine_mask) { struct intel_vgpu_submission *s = &vgpu->submission; - struct drm_i915_private *dev_priv = vgpu->gvt->gt->i915; struct intel_engine_cs *engine; struct intel_vgpu_workload *pos, *n; intel_engine_mask_t tmp; /* free the unsubmited workloads in the queues. */ - for_each_engine_masked(engine, &dev_priv->gt, engine_mask, tmp) { + for_each_engine_masked(engine, vgpu->gvt->gt, engine_mask, tmp) { list_for_each_entry_safe(pos, n, &s->workload_q_head[engine->id], list) { list_del_init(&pos->list); From 67f1120381df022a7016f4acc8d4880da9a66c03 Mon Sep 17 00:00:00 2001 From: Zhi Wang Date: Sun, 10 Jan 2021 23:43:05 +0200 Subject: [PATCH 223/506] drm/i915/gvt: Introduce per object locking in GVT scheduler. To support ww locking and per-object implemented in i915, GVT scheduler needs to be refined. Most of the changes are located in shadow batch buffer, shadow wa context in GVT-g, where use quite a lot of i915 gem object APIs. v2: - Adjust the usage of ww lock on context pin/unpin. (maarten) - Rebase the patch on the newest staging branch. Fixes: 6b05030496f7 ("drm/i915: Convert i915_gem_object/client_blt.c to use ww locking as well, v2.") Cc: Maarten Lankhorst Cc: Joonas Lahtinen Cc: Zhenyu Wang Signed-off-by: Zhi Wang Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/1610314985-26065-1-git-send-email-zhi.wang.linux@gmail.com Reviewed-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/scheduler.c | 49 +++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index a55ae50dbbe1..fc735692f21f 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -412,7 +412,9 @@ static void release_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) if (!wa_ctx->indirect_ctx.obj) return; + i915_gem_object_lock(wa_ctx->indirect_ctx.obj, NULL); i915_gem_object_unpin_map(wa_ctx->indirect_ctx.obj); + i915_gem_object_unlock(wa_ctx->indirect_ctx.obj); i915_gem_object_put(wa_ctx->indirect_ctx.obj); wa_ctx->indirect_ctx.obj = NULL; @@ -520,6 +522,7 @@ static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload) struct intel_gvt *gvt = workload->vgpu->gvt; const int gmadr_bytes = gvt->device_info.gmadr_bytes_in_cmd; struct intel_vgpu_shadow_bb *bb; + struct i915_gem_ww_ctx ww; int ret; list_for_each_entry(bb, &workload->shadow_bb, list) { @@ -544,10 +547,19 @@ static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload) * directly */ if (!bb->ppgtt) { - bb->vma = i915_gem_object_ggtt_pin(bb->obj, - NULL, 0, 0, 0); + i915_gem_ww_ctx_init(&ww, false); +retry: + i915_gem_object_lock(bb->obj, &ww); + + bb->vma = i915_gem_object_ggtt_pin_ww(bb->obj, &ww, + NULL, 0, 0, 0); if (IS_ERR(bb->vma)) { ret = PTR_ERR(bb->vma); + if (ret == -EDEADLK) { + ret = i915_gem_ww_ctx_backoff(&ww); + if (!ret) + goto retry; + } goto err; } @@ -561,13 +573,15 @@ static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload) 0); if (ret) goto err; - } - /* No one is going to touch shadow bb from now on. */ - i915_gem_object_flush_map(bb->obj); + /* No one is going to touch shadow bb from now on. */ + i915_gem_object_flush_map(bb->obj); + i915_gem_object_unlock(bb->obj); + } } return 0; err: + i915_gem_ww_ctx_fini(&ww); release_shadow_batch_buffer(workload); return ret; } @@ -594,14 +608,29 @@ static int prepare_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) unsigned char *per_ctx_va = (unsigned char *)wa_ctx->indirect_ctx.shadow_va + wa_ctx->indirect_ctx.size; + struct i915_gem_ww_ctx ww; + int ret; if (wa_ctx->indirect_ctx.size == 0) return 0; - vma = i915_gem_object_ggtt_pin(wa_ctx->indirect_ctx.obj, NULL, - 0, CACHELINE_BYTES, 0); - if (IS_ERR(vma)) - return PTR_ERR(vma); + i915_gem_ww_ctx_init(&ww, false); +retry: + i915_gem_object_lock(wa_ctx->indirect_ctx.obj, &ww); + + vma = i915_gem_object_ggtt_pin_ww(wa_ctx->indirect_ctx.obj, &ww, NULL, + 0, CACHELINE_BYTES, 0); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + if (ret == -EDEADLK) { + ret = i915_gem_ww_ctx_backoff(&ww); + if (!ret) + goto retry; + } + return ret; + } + + i915_gem_object_unlock(wa_ctx->indirect_ctx.obj); /* FIXME: we are not tracking our pinned VMA leaving it * up to the core to fix up the stray pin_count upon @@ -635,12 +664,14 @@ static void release_shadow_batch_buffer(struct intel_vgpu_workload *workload) list_for_each_entry_safe(bb, pos, &workload->shadow_bb, list) { if (bb->obj) { + i915_gem_object_lock(bb->obj, NULL); if (bb->va && !IS_ERR(bb->va)) i915_gem_object_unpin_map(bb->obj); if (bb->vma && !IS_ERR(bb->vma)) i915_vma_unpin(bb->vma); + i915_gem_object_unlock(bb->obj); i915_gem_object_put(bb->obj); } list_del(&bb->list); From 2596b6ae412be3d29632efc63976a2132032e620 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 19 Feb 2021 14:51:42 -0500 Subject: [PATCH 224/506] kexec: move machine_kexec_post_load() to public interface The kernel test robot reports the following compiler warning: | arch/arm64/kernel/machine_kexec.c:62:5: warning: no previous prototype for | function 'machine_kexec_post_load' [-Wmissing-prototypes] | int machine_kexec_post_load(struct kimage *kimage) Fix it by moving the declaration of machine_kexec_post_load() from kexec_internal.h to the public header instead. Reported-by: kernel test robot Link: https://lore.kernel.org/linux-arm-kernel/202102030727.gqTokACH-lkp@intel.com Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210219195142.13571-1-pasha.tatashin@soleen.com Fixes: 4c3c31230c91 ("arm64: kexec: move relocation function setup") Signed-off-by: Will Deacon --- include/linux/kexec.h | 2 ++ kernel/kexec_internal.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9e93bef52968..3671b845cf28 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -309,6 +309,8 @@ extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +int machine_kexec_post_load(struct kimage *image); + extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 39d30ccf8d87..48aaf2ac0d0d 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,8 +13,6 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -int machine_kexec_post_load(struct kimage *image); - extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE From 72d6b2459dbd539c1369149e501fdc3dc8ddef16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Jan 2021 08:32:39 +0100 Subject: [PATCH 225/506] pwm: iqs620a: Fix overflow and optimize calculations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If state->duty_cycle is 0x100000000000000, the previous calculation of duty_scale overflows and yields a duty cycle ratio of 0% instead of 100%. Fix this by clamping the requested duty cycle to the maximal possible duty cycle first. This way it is possible to use a native integer division instead of a (depending on the architecture) more expensive 64bit division. With this change in place duty_scale cannot be bigger than 256 which allows to simplify the calculation of duty_val. Fixes: 6f0841a8197b ("pwm: Add support for Azoteq IQS620A PWM generator") Tested-by: Jeff LaBundy Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-iqs620a.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-iqs620a.c b/drivers/pwm/pwm-iqs620a.c index 5ede8255926e..14b18fb4f527 100644 --- a/drivers/pwm/pwm-iqs620a.c +++ b/drivers/pwm/pwm-iqs620a.c @@ -46,7 +46,8 @@ static int iqs620_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, { struct iqs620_pwm_private *iqs620_pwm; struct iqs62x_core *iqs62x; - u64 duty_scale; + unsigned int duty_cycle; + unsigned int duty_scale; int ret; if (state->polarity != PWM_POLARITY_NORMAL) @@ -70,7 +71,8 @@ static int iqs620_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, * For lower duty cycles (e.g. 0), the PWM output is simply disabled to * allow an external pull-down resistor to hold the GPIO3/LTX pin low. */ - duty_scale = div_u64(state->duty_cycle * 256, IQS620_PWM_PERIOD_NS); + duty_cycle = min_t(u64, state->duty_cycle, IQS620_PWM_PERIOD_NS); + duty_scale = duty_cycle * 256 / IQS620_PWM_PERIOD_NS; mutex_lock(&iqs620_pwm->lock); @@ -82,7 +84,7 @@ static int iqs620_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, } if (duty_scale) { - u8 duty_val = min_t(u64, duty_scale - 1, 0xff); + u8 duty_val = duty_scale - 1; ret = regmap_write(iqs62x->regmap, IQS620_PWM_DUTY_CYCLE, duty_val); From 28208c7b4a2c38ea91b6ee04f6023d3145257f5d Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 18 Jan 2021 22:30:29 -0600 Subject: [PATCH 226/506] pwm: iqs620a: Correct a stale state variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If duty cycle is first set to a value that is sufficiently high to enable the output (e.g. 10000 ns) but then lowered to a value that is quantized to zero (e.g. 1000 ns), the output is disabled as the device cannot drive a constant zero (as expected). However if the device is later re-initialized due to watchdog bite, the output is re-enabled at the next-to-last duty cycle (10000 ns). This is because the iqs620_pwm->out_en flag unconditionally tracks state->enabled instead of what was actually written to the device. To solve this problem, use one state variable that encodes all 257 states of the output (duty_scale) with 0 representing tri-state, 1 representing the minimum available duty cycle and 256 representing 100% duty cycle. Signed-off-by: Jeff LaBundy Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-iqs620a.c | 88 ++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 51 deletions(-) diff --git a/drivers/pwm/pwm-iqs620a.c b/drivers/pwm/pwm-iqs620a.c index 14b18fb4f527..957b972c458b 100644 --- a/drivers/pwm/pwm-iqs620a.c +++ b/drivers/pwm/pwm-iqs620a.c @@ -37,15 +37,32 @@ struct iqs620_pwm_private { struct pwm_chip chip; struct notifier_block notifier; struct mutex lock; - bool out_en; - u8 duty_val; + unsigned int duty_scale; }; +static int iqs620_pwm_init(struct iqs620_pwm_private *iqs620_pwm, + unsigned int duty_scale) +{ + struct iqs62x_core *iqs62x = iqs620_pwm->iqs62x; + int ret; + + if (!duty_scale) + return regmap_update_bits(iqs62x->regmap, IQS620_PWR_SETTINGS, + IQS620_PWR_SETTINGS_PWM_OUT, 0); + + ret = regmap_write(iqs62x->regmap, IQS620_PWM_DUTY_CYCLE, + duty_scale - 1); + if (ret) + return ret; + + return regmap_update_bits(iqs62x->regmap, IQS620_PWR_SETTINGS, + IQS620_PWR_SETTINGS_PWM_OUT, 0xff); +} + static int iqs620_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state) { struct iqs620_pwm_private *iqs620_pwm; - struct iqs62x_core *iqs62x; unsigned int duty_cycle; unsigned int duty_scale; int ret; @@ -57,7 +74,6 @@ static int iqs620_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return -EINVAL; iqs620_pwm = container_of(chip, struct iqs620_pwm_private, chip); - iqs62x = iqs620_pwm->iqs62x; /* * The duty cycle generated by the device is calculated as follows: @@ -74,36 +90,15 @@ static int iqs620_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, duty_cycle = min_t(u64, state->duty_cycle, IQS620_PWM_PERIOD_NS); duty_scale = duty_cycle * 256 / IQS620_PWM_PERIOD_NS; + if (!state->enabled) + duty_scale = 0; + mutex_lock(&iqs620_pwm->lock); - if (!state->enabled || !duty_scale) { - ret = regmap_update_bits(iqs62x->regmap, IQS620_PWR_SETTINGS, - IQS620_PWR_SETTINGS_PWM_OUT, 0); - if (ret) - goto err_mutex; - } + ret = iqs620_pwm_init(iqs620_pwm, duty_scale); + if (!ret) + iqs620_pwm->duty_scale = duty_scale; - if (duty_scale) { - u8 duty_val = duty_scale - 1; - - ret = regmap_write(iqs62x->regmap, IQS620_PWM_DUTY_CYCLE, - duty_val); - if (ret) - goto err_mutex; - - iqs620_pwm->duty_val = duty_val; - } - - if (state->enabled && duty_scale) { - ret = regmap_update_bits(iqs62x->regmap, IQS620_PWR_SETTINGS, - IQS620_PWR_SETTINGS_PWM_OUT, 0xff); - if (ret) - goto err_mutex; - } - - iqs620_pwm->out_en = state->enabled; - -err_mutex: mutex_unlock(&iqs620_pwm->lock); return ret; @@ -121,12 +116,11 @@ static void iqs620_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, /* * Since the device cannot generate a 0% duty cycle, requests to do so * cause subsequent calls to iqs620_pwm_get_state to report the output - * as disabled with duty cycle equal to that which was in use prior to - * the request. This is not ideal, but is the best compromise based on + * as disabled. This is not ideal, but is the best compromise based on * the capabilities of the device. */ - state->enabled = iqs620_pwm->out_en; - state->duty_cycle = DIV_ROUND_UP((iqs620_pwm->duty_val + 1) * + state->enabled = iqs620_pwm->duty_scale > 0; + state->duty_cycle = DIV_ROUND_UP(iqs620_pwm->duty_scale * IQS620_PWM_PERIOD_NS, 256); mutex_unlock(&iqs620_pwm->lock); @@ -138,7 +132,6 @@ static int iqs620_pwm_notifier(struct notifier_block *notifier, unsigned long event_flags, void *context) { struct iqs620_pwm_private *iqs620_pwm; - struct iqs62x_core *iqs62x; int ret; if (!(event_flags & BIT(IQS62X_EVENT_SYS_RESET))) @@ -146,7 +139,6 @@ static int iqs620_pwm_notifier(struct notifier_block *notifier, iqs620_pwm = container_of(notifier, struct iqs620_pwm_private, notifier); - iqs62x = iqs620_pwm->iqs62x; mutex_lock(&iqs620_pwm->lock); @@ -155,16 +147,8 @@ static int iqs620_pwm_notifier(struct notifier_block *notifier, * of a device reset, so nothing else is printed here unless there is * an additional failure. */ - ret = regmap_write(iqs62x->regmap, IQS620_PWM_DUTY_CYCLE, - iqs620_pwm->duty_val); - if (ret) - goto err_mutex; + ret = iqs620_pwm_init(iqs620_pwm, iqs620_pwm->duty_scale); - ret = regmap_update_bits(iqs62x->regmap, IQS620_PWR_SETTINGS, - IQS620_PWR_SETTINGS_PWM_OUT, - iqs620_pwm->out_en ? 0xff : 0); - -err_mutex: mutex_unlock(&iqs620_pwm->lock); if (ret) { @@ -211,12 +195,14 @@ static int iqs620_pwm_probe(struct platform_device *pdev) ret = regmap_read(iqs62x->regmap, IQS620_PWR_SETTINGS, &val); if (ret) return ret; - iqs620_pwm->out_en = val & IQS620_PWR_SETTINGS_PWM_OUT; - ret = regmap_read(iqs62x->regmap, IQS620_PWM_DUTY_CYCLE, &val); - if (ret) - return ret; - iqs620_pwm->duty_val = val; + if (val & IQS620_PWR_SETTINGS_PWM_OUT) { + ret = regmap_read(iqs62x->regmap, IQS620_PWM_DUTY_CYCLE, &val); + if (ret) + return ret; + + iqs620_pwm->duty_scale = val + 1; + } iqs620_pwm->chip.dev = &pdev->dev; iqs620_pwm->chip.ops = &iqs620_pwm_ops; From 9a9dd7e473517b68412fd2da3da8a4aeb4ecb38a Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 3 Feb 2021 10:50:05 +0800 Subject: [PATCH 227/506] pwm: lpc18xx-sct: remove unneeded semicolon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate the following coccicheck warning: ./drivers/pwm/pwm-lpc18xx-sct.c:292:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Acked-by: Uwe Kleine-König Acked-by: Vladimir Zapolskiy Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpc18xx-sct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c index dc5133bec3e7..7ef40243eb6c 100644 --- a/drivers/pwm/pwm-lpc18xx-sct.c +++ b/drivers/pwm/pwm-lpc18xx-sct.c @@ -289,7 +289,7 @@ static int lpc18xx_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) dev_err(lpc18xx_pwm->dev, "maximum number of simultaneous channels reached\n"); return -EBUSY; - }; + } set_bit(event, &lpc18xx_pwm->event_map); lpc18xx_data->duty_event = event; From bfa5782b9caa26f93f42ad79804e1f75a1ce9f18 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 21 Feb 2021 19:28:53 -0800 Subject: [PATCH 228/506] fbdev: atyfb: add stubs for aty_{ld,st}_lcd() Fix build errors when these functions are not defined. ../drivers/video/fbdev/aty/atyfb_base.c: In function 'aty_power_mgmt': ../drivers/video/fbdev/aty/atyfb_base.c:2002:7: error: implicit declaration of function 'aty_ld_lcd'; did you mean 'aty_ld_8'? [-Werror=implicit-function-declaration] 2002 | pm = aty_ld_lcd(POWER_MANAGEMENT, par); ../drivers/video/fbdev/aty/atyfb_base.c:2004:2: error: implicit declaration of function 'aty_st_lcd'; did you mean 'aty_st_8'? [-Werror=implicit-function-declaration] 2004 | aty_st_lcd(POWER_MANAGEMENT, pm, par); Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Bartlomiej Zolnierkiewicz Cc: Sam Ravnborg Cc: Daniel Vetter Cc: David Airlie Cc: Jani Nikula Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210222032853.21483-1-rdunlap@infradead.org --- drivers/video/fbdev/aty/atyfb_base.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c index 83c8e809955a..e946903a86c2 100644 --- a/drivers/video/fbdev/aty/atyfb_base.c +++ b/drivers/video/fbdev/aty/atyfb_base.c @@ -175,6 +175,15 @@ u32 aty_ld_lcd(int index, const struct atyfb_par *par) return aty_ld_le32(LCD_DATA, par); } } +#else /* defined(CONFIG_PMAC_BACKLIGHT) || defined(CONFIG_FB_ATY_BACKLIGHT) \ + defined(CONFIG_FB_ATY_GENERIC_LCD) */ +void aty_st_lcd(int index, u32 val, const struct atyfb_par *par) +{ } + +u32 aty_ld_lcd(int index, const struct atyfb_par *par) +{ + return 0; +} #endif /* defined(CONFIG_PMAC_BACKLIGHT) || defined (CONFIG_FB_ATY_GENERIC_LCD) */ #ifdef CONFIG_FB_ATY_GENERIC_LCD From d2df592fd8c6c14a43e08314a91101d60b32da01 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Feb 2021 09:55:39 -0500 Subject: [PATCH 229/506] KVM: nSVM: prepare guest save area while is_guest_mode is true Right now, enter_svm_guest_mode is calling nested_prepare_vmcb_save and nested_prepare_vmcb_control. This results in is_guest_mode being false until the end of nested_prepare_vmcb_control. This is a problem because nested_prepare_vmcb_save can in turn cause changes to the intercepts and these have to be applied to the "host VMCB" (stored in svm->nested.hsave) and then merged with the VMCB12 intercepts into svm->vmcb. In particular, without this change we forget to set the CR0 read and CR0 write intercepts when running a real mode L2 guest with NPT disabled. The guest is therefore able to see the CR0.PG bit that KVM sets to enable "paged real mode". This patch fixes the svm.flat mode_switch test case with npt=0. There are no other problematic calls in nested_prepare_vmcb_save. Moving is_guest_mode to the end is done since commit 06fc7772690d ("KVM: SVM: Activate nested state only when guest state is complete", 2010-04-25). However, back then KVM didn't grab a different VMCB when updating the intercepts, it had already copied/merged L1's stuff to L0's VMCB, and then updated L0's VMCB regardless of is_nested(). Later recalc_intercepts was introduced in commit 384c63684397 ("KVM: SVM: Add function to recalculate intercept masks", 2011-01-12). This introduced the bug, because recalc_intercepts now throws away the intercept manipulations that svm_set_cr0 had done in the meanwhile to svm->vmcb. [1] https://lore.kernel.org/kvm/1266493115-28386-1-git-send-email-joerg.roedel@amd.com/ Reviewed-by: Sean Christopherson Tested-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 92d3aaaac612..35891d9a1099 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -469,8 +469,8 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, svm->nested.vmcb12_gpa = vmcb12_gpa; load_nested_vmcb_control(svm, &vmcb12->control); - nested_prepare_vmcb_save(svm, vmcb12); nested_prepare_vmcb_control(svm); + nested_prepare_vmcb_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); From 356c7558d453338c9184809c0926071dfbfb9c80 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 4 Jan 2021 10:59:38 +0100 Subject: [PATCH 230/506] KVM: Documentation: rectify rst markup in KVM_GET_SUPPORTED_HV_CPUID Commit c21d54f0307f ("KVM: x86: hyper-v: allow KVM_GET_SUPPORTED_HV_CPUID as a system ioctl") added an enumeration in the KVM_GET_SUPPORTED_HV_CPUID documentation improperly for rst, and caused new warnings in make htmldocs: Documentation/virt/kvm/api.rst:4536: WARNING: Unexpected indentation. Documentation/virt/kvm/api.rst:4538: WARNING: Block quote ends without a blank line; unexpected unindent. Fix that issue and another historic rst markup issue from the initial rst conversion in the KVM_GET_SUPPORTED_HV_CPUID documentation. Signed-off-by: Lukas Bulwahn Message-Id: <20210104095938.24838-1-lukas.bulwahn@gmail.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 45fd862ac128..aed52b0fc16e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4519,6 +4519,7 @@ KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature leaves (0x40000000, 0x40000001). Currently, the following list of CPUID leaves are returned: + - HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS - HYPERV_CPUID_INTERFACE - HYPERV_CPUID_VERSION @@ -4543,6 +4544,7 @@ userspace should not expect to get any particular value there. Note, vcpu version of KVM_GET_SUPPORTED_HV_CPUID is currently deprecated. Unlike system ioctl which exposes all supported feature bits unconditionally, vcpu version has the following quirks: + - HYPERV_CPUID_NESTED_FEATURES leaf and HV_X64_ENLIGHTENED_VMCS_RECOMMENDED feature bit are only exposed when Enlightened VMCS was previously enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). From 5f8a7cf25a7da5c2bbde25b3f0aca31459d20741 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Feb 2021 11:45:21 +0900 Subject: [PATCH 231/506] KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault Don't retry a page fault due to an mmu_notifier invalidation when handling a page fault for a GPA that did not resolve to a memslot, i.e. an MMIO page fault. Invalidations from the mmu_notifier signal a change in a host virtual address (HVA) mapping; without a memslot, there is no HVA and thus no possibility that the invalidation is relevant to the page fault being handled. Note, the MMIO vs. memslot generation checks handle the case where a pending memslot will create a memslot overlapping the faulting GPA. The mmu_notifier checks are orthogonal to memslot updates. Signed-off-by: Sean Christopherson Message-Id: <20210222024522.1751719-2-stevensd@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e78d8e51aa2..752b4b7ab01b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3712,7 +3712,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index d9f66cc459e8..5844d3979bb8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -869,7 +869,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); From 4a42d848db9544e3108875390886dc490d9c101e Mon Sep 17 00:00:00 2001 From: David Stevens Date: Mon, 22 Feb 2021 11:45:22 +0900 Subject: [PATCH 232/506] KVM: x86/mmu: Consider the hva in mmu_notifier retry Track the range being invalidated by mmu_notifier and skip page fault retries if the fault address is not affected by the in-progress invalidation. Handle concurrent invalidations by finding the minimal range which includes all ranges being invalidated. Although the combined range may include unrelated addresses and cannot be shrunk as individual invalidation operations complete, it is unlikely the marginal gains of proper range tracking are worth the additional complexity. The primary benefit of this change is the reduction in the likelihood of extreme latency when handing a page fault due to another thread having been preempted while modifying host virtual addresses. Signed-off-by: David Stevens Message-Id: <20210222024522.1751719-3-stevensd@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/x86/kvm/mmu/mmu.c | 23 ++++++++++++++------ arch/x86/kvm/mmu/paging_tmpl.h | 14 ++++++++++--- include/linux/kvm_host.h | 25 +++++++++++++++++++++- virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++---- 6 files changed, 79 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 38ea396a23d6..8e06cd3f759c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, } else { /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok); + writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index bb35490400e9..e603de7ade52 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p); + writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 752b4b7ab01b..d75524bc8423 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2734,6 +2734,13 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. + */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + __direct_pte_prefetch(vcpu, sp, sptep); } @@ -3640,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, + bool write, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3654,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, + write, writable, hva); if (!async) return false; /* *pfn has correct page already */ @@ -3668,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, + write, writable, hva); return false; } @@ -3681,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; kvm_pfn_t pfn; + hva_t hva; int r; if (page_fault_handle_page_track(vcpu, error_code, gfn)) @@ -3699,7 +3709,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) @@ -3712,7 +3723,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5844d3979bb8..55d7b473ac44 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. + */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); @@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; + hva_t hva; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; int max_level; @@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) @@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e126ebda36d0..1b65e7204344 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -506,6 +507,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; + unsigned long mmu_notifier_range_start; + unsigned long mmu_notifier_range_end; #endif long tlbs_dirty; struct list_head devices; @@ -733,7 +736,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable); + bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); @@ -1207,6 +1210,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) return 1; return 0; } + +static inline int mmu_notifier_retry_hva(struct kvm *kvm, + unsigned long mmu_seq, + unsigned long hva) +{ + lockdep_assert_held(&kvm->mmu_lock); + /* + * If mmu_notifier_count is non-zero, then the range maintained by + * kvm_mmu_notifier_invalidate_range_start contains all addresses that + * might be being invalidated. Note that it may include some false + * positives, due to shortcuts when handing concurrent invalidations. + */ + if (unlikely(kvm->mmu_notifier_count) && + hva >= kvm->mmu_notifier_range_start && + hva < kvm->mmu_notifier_range_end) + return 1; + if (kvm->mmu_notifier_seq != mmu_seq) + return 1; + return 0; +} #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 001b9de4e727..383df23514b9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -486,6 +486,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * count is also read inside the mmu_lock critical section. */ kvm->mmu_notifier_count++; + if (likely(kvm->mmu_notifier_count == 1)) { + kvm->mmu_notifier_range_start = range->start; + kvm->mmu_notifier_range_end = range->end; + } else { + /* + * Fully tracking multiple concurrent ranges has dimishing + * returns. Keep things simple and just find the minimal range + * which includes the current and new ranges. As there won't be + * enough information to subtract a range after its invalidate + * completes, any ranges invalidated concurrently will + * accumulate and persist until all outstanding invalidates + * complete. + */ + kvm->mmu_notifier_range_start = + min(kvm->mmu_notifier_range_start, range->start); + kvm->mmu_notifier_range_end = + max(kvm->mmu_notifier_range_end, range->end); + } need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, range->flags); /* we've to flush the tlb before the pages can be freed */ @@ -2023,10 +2041,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable) + bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + if (hva) + *hva = addr; + if (addr == KVM_HVA_ERR_RO_BAD) { if (writable) *writable = false; @@ -2054,19 +2075,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, - write_fault, writable); + write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); From 7dcf7aa01c7b9f18727cbe0f9cb4136f1c6cdcc2 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Wed, 2 Sep 2020 08:53:44 -0700 Subject: [PATCH 233/506] ice: report correct max number of TCs In the driver currently, we are reporting max number of TCs to the DCBNL callback as a kernel define set to 8. This is preventing userspace applications performing DCBx to correctly down map the TCs from requested to actual values. Report the actual max TC value to userspace from the capability struct. Fixes: b94b013eb626 ("ice: Implement DCBNL support") Signed-off-by: Dave Ertman Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index fcfefad00d1c..299bf7edbf82 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -134,7 +134,7 @@ ice_dcbnl_getnumtcs(struct net_device *dev, int __always_unused tcid, u8 *num) if (!test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags)) return -EINVAL; - *num = IEEE_8021QAZ_MAX_TCS; + *num = pf->hw.func_caps.common_cap.maxtc; return 0; } From 37b52be260024069f7f5bdcf304b5d72f77b022a Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 17 Sep 2020 13:13:37 -0700 Subject: [PATCH 234/506] ice: Set trusted VF as default VSI when setting allmulti on Currently the PF will only set a trusted VF as the default VSI when it requests FLAG_VF_UNICAST_PROMISC over VIRTCHNL. However, when FLAG_VF_MULTICAST_PROMISC is set it's expected that the trusted VF will see multicast packets that don't have a matching destination MAC in the devices internal switch. Fix this by setting the trusted VF as the default VSI if either FLAG_VF_UNICAST_PROMISC or FLAG_VF_MULTICAST_PROMISC is set. Fixes: 01b5e89aab49 ("ice: Add VF promiscuous support") Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index bf5fd812ea0e..07fae37a78be 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2420,7 +2420,7 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg) } if (!test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags)) { - bool set_dflt_vsi = !!(info->flags & FLAG_VF_UNICAST_PROMISC); + bool set_dflt_vsi = alluni || allmulti; if (set_dflt_vsi && !ice_is_dflt_vsi_in_use(pf->first_sw)) /* only attempt to set the default forwarding VSI if From a6aa7c8f998f4afddd73410aa043dad38162ce9e Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 17 Sep 2020 13:13:38 -0700 Subject: [PATCH 235/506] ice: Account for port VLAN in VF max packet size calculation Currently if an AVF driver doesn't account for the possibility of a port VLAN when determining its max packet size then packets at MTU will be dropped. It is not the VF driver's responsibility to account for a port VLAN so fix this. To fix this, do the following: 1. Add a function that determines the max packet size a VF is allowed by using the port's max packet size and whether the VF is in a port VLAN. If a port VLAN is configured then a VF's max packet size will always be the port's max packet size minus VLAN_HLEN. Otherwise it will be the port's max packet size. 2. Use this function to verify the max packet size from the VF. 3. If there is a port VLAN configured then add 4 bytes (VLAN_HLEN) to the VF's max packet size configuration. Also, the VIRTCHNL_OP_GET_VF_RESOURCES message provides the capability to communicate a VF's max packet size. Use the new function for this purpose. Fixes: 1071a8358a28 ("ice: Implement virtchnl commands for AVF support") Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/ice_virtchnl_pf.c | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 07fae37a78be..1f38a8d0c525 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1918,6 +1918,29 @@ static int ice_vc_get_ver_msg(struct ice_vf *vf, u8 *msg) sizeof(struct virtchnl_version_info)); } +/** + * ice_vc_get_max_frame_size - get max frame size allowed for VF + * @vf: VF used to determine max frame size + * + * Max frame size is determined based on the current port's max frame size and + * whether a port VLAN is configured on this VF. The VF is not aware whether + * it's in a port VLAN so the PF needs to account for this in max frame size + * checks and sending the max frame size to the VF. + */ +static u16 ice_vc_get_max_frame_size(struct ice_vf *vf) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + struct ice_port_info *pi = vsi->port_info; + u16 max_frame_size; + + max_frame_size = pi->phy.link_info.max_frame_size; + + if (vf->port_vlan_info) + max_frame_size -= VLAN_HLEN; + + return max_frame_size; +} + /** * ice_vc_get_vf_res_msg * @vf: pointer to the VF info @@ -2000,6 +2023,7 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) vfres->max_vectors = pf->num_msix_per_vf; vfres->rss_key_size = ICE_VSIQF_HKEY_ARRAY_SIZE; vfres->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vfres->max_mtu = ice_vc_get_max_frame_size(vf); vfres->vsi_res[0].vsi_id = vf->lan_vsi_num; vfres->vsi_res[0].vsi_type = VIRTCHNL_VSI_SRIOV; @@ -2998,6 +3022,8 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) /* copy Rx queue info from VF into VSI */ if (qpi->rxq.ring_len > 0) { + u16 max_frame_size = ice_vc_get_max_frame_size(vf); + num_rxq++; vsi->rx_rings[i]->dma = qpi->rxq.dma_ring_addr; vsi->rx_rings[i]->count = qpi->rxq.ring_len; @@ -3010,7 +3036,7 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) } vsi->rx_buf_len = qpi->rxq.databuffer_size; vsi->rx_rings[i]->rx_buf_len = vsi->rx_buf_len; - if (qpi->rxq.max_pkt_size >= (16 * 1024) || + if (qpi->rxq.max_pkt_size > max_frame_size || qpi->rxq.max_pkt_size < 64) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; @@ -3018,6 +3044,11 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) } vsi->max_frame = qpi->rxq.max_pkt_size; + /* add space for the port VLAN since the VF driver is not + * expected to account for it in the MTU calculation + */ + if (vf->port_vlan_info) + vsi->max_frame += VLAN_HLEN; } /* VF can request to configure less than allocated queues or default From 0d4907f65dc8fc5e897ad19956fca1acb3b33bc8 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Fri, 20 Nov 2020 16:38:35 -0800 Subject: [PATCH 236/506] ice: Fix state bits on LLDP mode switch DCBX_CAP bits were not being adjusted when switching between SW and FW controlled LLDP. Adjust bits to correctly indicate which mode the LLDP logic is in. Fixes: b94b013eb626 ("ice: Implement DCBNL support") Signed-off-by: Dave Ertman Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 2 -- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 4 ++++ drivers/net/ethernet/intel/ice/ice_ethtool.c | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index dae8280ce17c..357706444dd5 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -454,9 +454,7 @@ struct ice_pf { struct ice_hw_port_stats stats_prev; struct ice_hw hw; u8 stat_prev_loaded:1; /* has previous stats been loaded */ -#ifdef CONFIG_DCB u16 dcbx_cap; -#endif /* CONFIG_DCB */ u32 tx_timeout_count; unsigned long tx_timeout_last_recovery; u32 tx_timeout_recovery_level; diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index 299bf7edbf82..468a63f7eff9 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -159,6 +159,10 @@ static u8 ice_dcbnl_setdcbx(struct net_device *netdev, u8 mode) struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_qos_cfg *qos_cfg; + /* if FW LLDP agent is running, DCBNL not allowed to change mode */ + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) + return ICE_DCB_NO_HW_CHG; + /* No support for LLD_MANAGED modes or CEE+IEEE */ if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || ((mode & DCB_CAP_DCBX_VER_IEEE) && (mode & DCB_CAP_DCBX_VER_CEE)) || diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 5636c9b23896..4001857788f8 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -8,6 +8,7 @@ #include "ice_fltr.h" #include "ice_lib.h" #include "ice_dcb_lib.h" +#include struct ice_stats { char stat_string[ETH_GSTRING_LEN]; @@ -1238,6 +1239,9 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) status = ice_init_pf_dcb(pf, true); if (status) dev_warn(dev, "Fail to init DCB\n"); + + pf->dcbx_cap &= ~DCB_CAP_DCBX_LLD_MANAGED; + pf->dcbx_cap |= DCB_CAP_DCBX_HOST; } else { enum ice_status status; bool dcbx_agent_status; @@ -1280,6 +1284,9 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) if (status) dev_dbg(dev, "Fail to enable MIB change events\n"); + pf->dcbx_cap &= ~DCB_CAP_DCBX_HOST; + pf->dcbx_cap |= DCB_CAP_DCBX_LLD_MANAGED; + ice_nway_reset(netdev); } } From 0393e46ac48a6832b1011c233ebcef84f8dbe4f5 Mon Sep 17 00:00:00 2001 From: Henry Tieman Date: Thu, 3 Dec 2020 09:20:24 -0800 Subject: [PATCH 237/506] ice: update the number of available RSS queues It was possible to have Rx queues that were not available for use by RSS. This would happen when increasing the number of Rx queues while there was a user defined RSS LUT. Always update the number of available RSS queues when changing the number of Rx queues. Fixes: 87324e747fde ("ice: Implement ethtool ops for channels") Signed-off-by: Henry Tieman Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 27 ++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 4001857788f8..2dcfa685b763 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3328,6 +3328,18 @@ ice_get_channels(struct net_device *dev, struct ethtool_channels *ch) ch->max_other = ch->other_count; } +/** + * ice_get_valid_rss_size - return valid number of RSS queues + * @hw: pointer to the HW structure + * @new_size: requested RSS queues + */ +static int ice_get_valid_rss_size(struct ice_hw *hw, int new_size) +{ + struct ice_hw_common_caps *caps = &hw->func_caps.common_cap; + + return min_t(int, new_size, BIT(caps->rss_table_entry_width)); +} + /** * ice_vsi_set_dflt_rss_lut - set default RSS LUT with requested RSS size * @vsi: VSI to reconfigure RSS LUT on @@ -3355,14 +3367,10 @@ static int ice_vsi_set_dflt_rss_lut(struct ice_vsi *vsi, int req_rss_size) return -ENOMEM; /* set RSS LUT parameters */ - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) vsi->rss_size = 1; - } else { - struct ice_hw_common_caps *caps = &hw->func_caps.common_cap; - - vsi->rss_size = min_t(int, req_rss_size, - BIT(caps->rss_table_entry_width)); - } + else + vsi->rss_size = ice_get_valid_rss_size(hw, req_rss_size); /* create/set RSS LUT */ ice_fill_rss_lut(lut, vsi->rss_table_size, vsi->rss_size); @@ -3441,9 +3449,12 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) ice_vsi_recfg_qs(vsi, new_rx, new_tx); - if (new_rx && !netif_is_rxfh_configured(dev)) + if (!netif_is_rxfh_configured(dev)) return ice_vsi_set_dflt_rss_lut(vsi, new_rx); + /* Update rss_size due to change in Rx queues */ + vsi->rss_size = ice_get_valid_rss_size(&pf->hw, new_rx); + return 0; } From 7b3d19a76d6824e5c0455566b348e1a5267383af Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 18 Feb 2021 15:48:50 -0700 Subject: [PATCH 238/506] drm/amd/pm/swsmu: Avoid using structure_size uninitialized in smu_cmn_init_soft_gpu_metrics Clang warns: drivers/gpu/drm/amd/amdgpu/../pm/swsmu/smu_cmn.c:764:2: warning: variable 'structure_size' is used uninitialized whenever switch default is taken [-Wsometimes-uninitialized] default: ^~~~~~~ drivers/gpu/drm/amd/amdgpu/../pm/swsmu/smu_cmn.c:770:23: note: uninitialized use occurs here memset(header, 0xFF, structure_size); ^~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../pm/swsmu/smu_cmn.c:753:25: note: initialize the variable 'structure_size' to silence this warning uint16_t structure_size; ^ = 0 1 warning generated. Return in the default case, as the size of the header will not be known. Fixes: de4b7cd8cb87 ("drm/amd/pm/swsmu: unify the init soft gpu metrics function") Link: https://github.com/ClangBuiltLinux/linux/issues/1304 Reviewed-by: Kevin Wang Signed-off-by: Nathan Chancellor Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index bb620fdd4cd2..bcedd4d92e35 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -762,7 +762,7 @@ void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev) structure_size = sizeof(struct gpu_metrics_v2_0); break; default: - break; + return; } #undef METRICS_VERSION From efc8278eecfd5e6fa36c5d41e71d038f534fe107 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Thu, 18 Feb 2021 19:42:57 -0500 Subject: [PATCH 239/506] Revert "drm/amd/display: reuse current context instead of recreating one" This reverts commit 8866a67ab86cc0812e65c04f1ef02bcc41e24d68. Reason for revert: This breaks hotplug of HDMI on some systems, resulting in a blank screen. Caused general hangs on boot/hotplugs. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1487 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1492 Bug: https://bugzilla.kernel.org/show_bug.cgi?id=211649 Signed-off-by: Anson Jacob Reviewed-by: Bhawanpreet Lakha Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 23 ++++++++------- drivers/gpu/drm/amd/display/dc/core/dc.c | 29 ++++++------------- drivers/gpu/drm/amd/display/dc/dc_stream.h | 3 +- 3 files changed, 24 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 94cd5ddd67ef..344404c4ac75 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1936,7 +1936,7 @@ static void dm_gpureset_commit_state(struct dc_state *dc_state, dc_commit_updates_for_stream( dm->dc, bundle->surface_updates, dc_state->stream_status->plane_count, - dc_state->streams[k], &bundle->stream_update); + dc_state->streams[k], &bundle->stream_update, dc_state); } cleanup: @@ -1967,7 +1967,8 @@ static void dm_set_dpms_off(struct dc_link *link) stream_update.stream = stream_state; dc_commit_updates_for_stream(stream_state->ctx->dc, NULL, 0, - stream_state, &stream_update); + stream_state, &stream_update, + stream_state->ctx->dc->current_state); mutex_unlock(&adev->dm.dc_lock); } @@ -7663,7 +7664,7 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, struct drm_crtc *pcrtc, bool wait_for_vblank) { - int i; + uint32_t i; uint64_t timestamp_ns; struct drm_plane *plane; struct drm_plane_state *old_plane_state, *new_plane_state; @@ -7704,7 +7705,7 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, amdgpu_dm_commit_cursors(state); /* update planes when needed */ - for_each_oldnew_plane_in_state_reverse(state, plane, old_plane_state, new_plane_state, i) { + for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) { struct drm_crtc *crtc = new_plane_state->crtc; struct drm_crtc_state *new_crtc_state; struct drm_framebuffer *fb = new_plane_state->fb; @@ -7927,7 +7928,8 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, bundle->surface_updates, planes_count, acrtc_state->stream, - &bundle->stream_update); + &bundle->stream_update, + dc_state); /** * Enable or disable the interrupts on the backend. @@ -8263,13 +8265,13 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) struct dm_connector_state *dm_new_con_state = to_dm_connector_state(new_con_state); struct dm_connector_state *dm_old_con_state = to_dm_connector_state(old_con_state); struct amdgpu_crtc *acrtc = to_amdgpu_crtc(dm_new_con_state->base.crtc); - struct dc_surface_update surface_updates[MAX_SURFACES]; + struct dc_surface_update dummy_updates[MAX_SURFACES]; struct dc_stream_update stream_update; struct dc_info_packet hdr_packet; struct dc_stream_status *status = NULL; bool abm_changed, hdr_changed, scaling_changed; - memset(&surface_updates, 0, sizeof(surface_updates)); + memset(&dummy_updates, 0, sizeof(dummy_updates)); memset(&stream_update, 0, sizeof(stream_update)); if (acrtc) { @@ -8326,15 +8328,16 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) * To fix this, DC should permit updating only stream properties. */ for (j = 0; j < status->plane_count; j++) - surface_updates[j].surface = status->plane_states[j]; + dummy_updates[j].surface = status->plane_states[0]; mutex_lock(&dm->dc_lock); dc_commit_updates_for_stream(dm->dc, - surface_updates, + dummy_updates, status->plane_count, dm_new_crtc_state->stream, - &stream_update); + &stream_update, + dc_state); mutex_unlock(&dm->dc_lock); } diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index c9aede2f783d..8f8a13c7cf73 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2697,7 +2697,8 @@ void dc_commit_updates_for_stream(struct dc *dc, struct dc_surface_update *srf_updates, int surface_count, struct dc_stream_state *stream, - struct dc_stream_update *stream_update) + struct dc_stream_update *stream_update, + struct dc_state *state) { const struct dc_stream_status *stream_status; enum surface_update_type update_type; @@ -2716,12 +2717,6 @@ void dc_commit_updates_for_stream(struct dc *dc, if (update_type >= UPDATE_TYPE_FULL) { - struct dc_plane_state *new_planes[MAX_SURFACES]; - - memset(new_planes, 0, sizeof(new_planes)); - - for (i = 0; i < surface_count; i++) - new_planes[i] = srf_updates[i].surface; /* initialize scratch memory for building context */ context = dc_create_state(dc); @@ -2730,21 +2725,15 @@ void dc_commit_updates_for_stream(struct dc *dc, return; } - dc_resource_state_copy_construct( - dc->current_state, context); + dc_resource_state_copy_construct(state, context); - /*remove old surfaces from context */ - if (!dc_rem_all_planes_for_stream(dc, stream, context)) { - DC_ERROR("Failed to remove streams for new validate context!\n"); - return; + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *new_pipe = &context->res_ctx.pipe_ctx[i]; + struct pipe_ctx *old_pipe = &dc->current_state->res_ctx.pipe_ctx[i]; + + if (new_pipe->plane_state && new_pipe->plane_state != old_pipe->plane_state) + new_pipe->plane_state->force_full_update = true; } - - /* add surface to context */ - if (!dc_add_all_planes_for_stream(dc, stream, new_planes, surface_count, context)) { - DC_ERROR("Failed to add streams for new validate context!\n"); - return; - } - } diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h index a4f7ec888c67..80b67b860091 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_stream.h +++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h @@ -294,7 +294,8 @@ void dc_commit_updates_for_stream(struct dc *dc, struct dc_surface_update *srf_updates, int surface_count, struct dc_stream_state *stream, - struct dc_stream_update *stream_update); + struct dc_stream_update *stream_update, + struct dc_state *state); /* * Log the current stream state. */ From 688f97ed3f5e339c0c2c09d9ee7ff23d5807b0a7 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Fri, 5 Feb 2021 14:15:11 -0500 Subject: [PATCH 240/506] drm/amd/display: Add vupdate_no_lock interrupts for DCN2.1 When run igt@kms_vrr in a device that uses DCN2.1 architecture, we noticed multiple failures. Furthermore, when we tested a VRR demo, we noticed a system hang where the mouse pointer still works, but the entire system freezes; in this case, we don't see any dmesg warning or failure messages kernel. This happens due to a lack of vupdate_no_lock interrupt, making the userspace wait eternally to get the event back. For fixing this issue, we need to add the vupdate_no_lock interrupt in the interrupt list. Signed-off-by: Rodrigo Siqueira Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../display/dc/irq/dcn21/irq_service_dcn21.c | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c index 1b971265418b..0e0f494fbb5e 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c +++ b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c @@ -168,6 +168,11 @@ static const struct irq_source_info_funcs vblank_irq_info_funcs = { .ack = NULL }; +static const struct irq_source_info_funcs vupdate_no_lock_irq_info_funcs = { + .set = NULL, + .ack = NULL +}; + #undef BASE_INNER #define BASE_INNER(seg) DMU_BASE__INST0_SEG ## seg @@ -230,6 +235,17 @@ static const struct irq_source_info_funcs vblank_irq_info_funcs = { .funcs = &vblank_irq_info_funcs\ } +/* vupdate_no_lock_int_entry maps to DC_IRQ_SOURCE_VUPDATEx, to match semantic + * of DCE's DC_IRQ_SOURCE_VUPDATEx. + */ +#define vupdate_no_lock_int_entry(reg_num)\ + [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ + IRQ_REG_ENTRY(OTG, reg_num,\ + OTG_GLOBAL_SYNC_STATUS, VUPDATE_NO_LOCK_INT_EN,\ + OTG_GLOBAL_SYNC_STATUS, VUPDATE_NO_LOCK_EVENT_CLEAR),\ + .funcs = &vupdate_no_lock_irq_info_funcs\ + } + #define vblank_int_entry(reg_num)\ [DC_IRQ_SOURCE_VBLANK1 + reg_num] = {\ IRQ_REG_ENTRY(OTG, reg_num,\ @@ -338,6 +354,12 @@ irq_source_info_dcn21[DAL_IRQ_SOURCES_NUMBER] = { vupdate_int_entry(3), vupdate_int_entry(4), vupdate_int_entry(5), + vupdate_no_lock_int_entry(0), + vupdate_no_lock_int_entry(1), + vupdate_no_lock_int_entry(2), + vupdate_no_lock_int_entry(3), + vupdate_no_lock_int_entry(4), + vupdate_no_lock_int_entry(5), vblank_int_entry(0), vblank_int_entry(1), vblank_int_entry(2), From 83e6667b675f101fb66659dfa72e45d08773d763 Mon Sep 17 00:00:00 2001 From: Eric Bernstein Date: Fri, 5 Feb 2021 13:53:31 -0500 Subject: [PATCH 241/506] drm/amd/display: Remove Assert from dcn10_get_dig_frontend [Why] In some cases, this function is called when DIG BE is not connected to DIG FE, in which case a value of zero isn't invalid and assert should not be hit. [How] Remove assert and handle ENGINE_ID_UNKNOWN result in calling function. Signed-off-by: Eric Bernstein Acked-by: Bindu Ramamurthy Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.c | 1 - drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.c index 59024653430c..e4701825b5a0 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.c @@ -480,7 +480,6 @@ unsigned int dcn10_get_dig_frontend(struct link_encoder *enc) break; default: // invalid source select DIG - ASSERT(false); result = ENGINE_ID_UNKNOWN; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c index 9620fb8a27dc..06dc1e2e8383 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c @@ -539,6 +539,8 @@ void dcn30_init_hw(struct dc *dc) fe = dc->links[i]->link_enc->funcs->get_dig_frontend( dc->links[i]->link_enc); + if (fe == ENGINE_ID_UNKNOWN) + continue; for (j = 0; j < dc->res_pool->stream_enc_count; j++) { if (fe == dc->res_pool->stream_enc[j]->id) { From 04b385f325080157ab1b5f8ce1b1de07ce0d9e27 Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Thu, 18 Feb 2021 11:45:14 +0800 Subject: [PATCH 242/506] net: ag71xx: remove unnecessary MTU reservation 2 bytes of the MTU are reserved for Atheros DSA tag, but DSA core has already handled that since commit dc0fe7d47f9f. Remove the unnecessary reservation. Fixes: d51b6ce441d3 ("net: ethernet: add ag71xx driver") Signed-off-by: DENG Qingfang Reviewed-by: Oleksij Rempel Link: https://lore.kernel.org/r/20210218034514.3421-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/atheros/ag71xx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index dd5c8a9038bb..a60ce9030581 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -223,8 +223,6 @@ #define AG71XX_REG_RX_SM 0x01b0 #define AG71XX_REG_TX_SM 0x01b4 -#define ETH_SWITCH_HEADER_LEN 2 - #define AG71XX_DEFAULT_MSG_ENABLE \ (NETIF_MSG_DRV \ | NETIF_MSG_PROBE \ @@ -933,7 +931,7 @@ static void ag71xx_hw_setup(struct ag71xx *ag) static unsigned int ag71xx_max_frame_len(unsigned int mtu) { - return ETH_SWITCH_HEADER_LEN + ETH_HLEN + VLAN_HLEN + mtu + ETH_FCS_LEN; + return ETH_HLEN + VLAN_HLEN + mtu + ETH_FCS_LEN; } static void ag71xx_hw_set_macaddr(struct ag71xx *ag, unsigned char *mac) From 4bb875632ad0e8b71fa0f0be292793c061a9f43c Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 3 Feb 2021 21:26:43 -0800 Subject: [PATCH 243/506] RISC-V: Add a non-void return for sbi v02 functions SBI v0.2 functions can return an error code from SBI implementation. We are already processing the SBI error code and coverts it to the Linux error code. Propagate to the error code to the caller as well. As of now, kvm is the only user of these error codes. Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 10 +++++----- arch/riscv/kernel/sbi.c | 32 ++++++++++++++++---------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 99dc77ba8e43..99895d9c3bdd 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -100,13 +100,13 @@ int sbi_console_getchar(void); void sbi_set_timer(uint64_t stime_value); void sbi_shutdown(void); void sbi_clear_ipi(void); -void sbi_send_ipi(const unsigned long *hart_mask); -void sbi_remote_fence_i(const unsigned long *hart_mask); -void sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_send_ipi(const unsigned long *hart_mask); +int sbi_remote_fence_i(const unsigned long *hart_mask); +int sbi_remote_sfence_vma(const unsigned long *hart_mask, unsigned long start, unsigned long size); -void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, unsigned long start, unsigned long size, unsigned long asid); @@ -147,7 +147,7 @@ static inline unsigned long sbi_minor_version(void) int sbi_err_map_linux_errno(int err); #else /* CONFIG_RISCV_SBI */ -static inline void sbi_remote_fence_i(const unsigned long *hart_mask) {} +static inline int sbi_remote_fence_i(const unsigned long *hart_mask) { return -1; } static inline void sbi_init(void) {} #endif /* CONFIG_RISCV_SBI */ #endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index bae6ffdc6e7b..f4a7db3d309e 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -351,7 +351,7 @@ static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask, * sbi_set_timer() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. * - * Return: None + * Return: None. */ void sbi_set_timer(uint64_t stime_value) { @@ -362,11 +362,11 @@ void sbi_set_timer(uint64_t stime_value) * sbi_send_ipi() - Send an IPI to any hart. * @hart_mask: A cpu mask containing all the target harts. * - * Return: None + * Return: 0 on success, appropriate linux error code otherwise. */ -void sbi_send_ipi(const unsigned long *hart_mask) +int sbi_send_ipi(const unsigned long *hart_mask) { - __sbi_send_ipi(hart_mask); + return __sbi_send_ipi(hart_mask); } EXPORT_SYMBOL(sbi_send_ipi); @@ -374,12 +374,12 @@ EXPORT_SYMBOL(sbi_send_ipi); * sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts. * @hart_mask: A cpu mask containing all the target harts. * - * Return: None + * Return: 0 on success, appropriate linux error code otherwise. */ -void sbi_remote_fence_i(const unsigned long *hart_mask) +int sbi_remote_fence_i(const unsigned long *hart_mask) { - __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I, - hart_mask, 0, 0, 0, 0); + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I, + hart_mask, 0, 0, 0, 0); } EXPORT_SYMBOL(sbi_remote_fence_i); @@ -390,14 +390,14 @@ EXPORT_SYMBOL(sbi_remote_fence_i); * @start: Start of the virtual address * @size: Total size of the virtual address range. * - * Return: None + * Return: 0 on success, appropriate linux error code otherwise. */ -void sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_remote_sfence_vma(const unsigned long *hart_mask, unsigned long start, unsigned long size) { - __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, - hart_mask, start, size, 0, 0); + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, + hart_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma); @@ -410,15 +410,15 @@ EXPORT_SYMBOL(sbi_remote_sfence_vma); * @size: Total size of the virtual address range. * @asid: The value of address space identifier (ASID). * - * Return: None + * Return: 0 on success, appropriate linux error code otherwise. */ -void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, unsigned long start, unsigned long size, unsigned long asid) { - __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, - hart_mask, start, size, asid, 0); + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, + hart_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); From c6ca7616f7d5c2ce166280107ba74db1d528fcb7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:14 +0900 Subject: [PATCH 244/506] clk: Add RISC-V Canaan Kendryte K210 clock driver Add a clock provider driver for the Canaan Kendryte K210 RISC-V SoC. This new driver with the compatible string "canaan,k210-clk" implements support for the full clock structure of the K210 SoC. Since it is required for the correct operation of the SoC, this driver is selected by default for compilation when the SOC_CANAAN option is selected. With this change, the k210-sysctl driver is turned into a simple platform driver which enables its power bus clock and triggers populating its child nodes. The sysctl driver retains the SOC early initialization code, but the implementation now relies on the new function k210_clk_early_init() provided by the new clk-k210 driver. The clock structure implemented and many of the coding ideas for the driver come from the work by Sean Anderson on the K210 support for the U-Boot project. Cc: Stephen Boyd Cc: Michael Turquette Cc: linux-clk@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Stephen Boyd Signed-off-by: Palmer Dabbelt --- drivers/clk/Kconfig | 7 + drivers/clk/Makefile | 1 + drivers/clk/clk-k210.c | 1007 ++++++++++++++++++++++++++++++ drivers/soc/canaan/Kconfig | 18 +- drivers/soc/canaan/Makefile | 2 +- drivers/soc/canaan/k210-sysctl.c | 199 ++---- include/soc/canaan/k210-sysctl.h | 2 + 7 files changed, 1061 insertions(+), 175 deletions(-) create mode 100644 drivers/clk/clk-k210.c diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index 85856cff506c..78f6288d5226 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -368,6 +368,13 @@ config COMMON_CLK_FIXED_MMIO help Support for Memory Mapped IO Fixed clocks +config COMMON_CLK_K210 + bool "Clock driver for the Canaan Kendryte K210 SoC" + depends on OF && RISCV && SOC_CANAAN + default SOC_CANAAN + help + Support for the Canaan Kendryte K210 RISC-V SoC clocks. + source "drivers/clk/actions/Kconfig" source "drivers/clk/analogbits/Kconfig" source "drivers/clk/baikal-t1/Kconfig" diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile index dbdc590e7de3..5f8f9f135df5 100644 --- a/drivers/clk/Makefile +++ b/drivers/clk/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_COMMON_CLK_ASPEED) += clk-aspeed.o obj-$(CONFIG_MACH_ASPEED_G6) += clk-ast2600.o obj-$(CONFIG_ARCH_HIGHBANK) += clk-highbank.o obj-$(CONFIG_CLK_HSDK) += clk-hsdk-pll.o +obj-$(CONFIG_COMMON_CLK_K210) += clk-k210.o obj-$(CONFIG_COMMON_CLK_LOCHNAGAR) += clk-lochnagar.o obj-$(CONFIG_COMMON_CLK_MAX77686) += clk-max77686.o obj-$(CONFIG_COMMON_CLK_MAX9485) += clk-max9485.o diff --git a/drivers/clk/clk-k210.c b/drivers/clk/clk-k210.c new file mode 100644 index 000000000000..6c84abf5b2e3 --- /dev/null +++ b/drivers/clk/clk-k210.c @@ -0,0 +1,1007 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (c) 2019 Western Digital Corporation or its affiliates. + */ +#define pr_fmt(fmt) "k210-clk: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct k210_sysclk; + +struct k210_clk { + int id; + struct k210_sysclk *ksc; + struct clk_hw hw; +}; + +struct k210_clk_cfg { + const char *name; + u8 gate_reg; + u8 gate_bit; + u8 div_reg; + u8 div_shift; + u8 div_width; + u8 div_type; + u8 mux_reg; + u8 mux_bit; +}; + +enum k210_clk_div_type { + K210_DIV_NONE, + K210_DIV_ONE_BASED, + K210_DIV_DOUBLE_ONE_BASED, + K210_DIV_POWER_OF_TWO, +}; + +#define K210_GATE(_reg, _bit) \ + .gate_reg = (_reg), \ + .gate_bit = (_bit) + +#define K210_DIV(_reg, _shift, _width, _type) \ + .div_reg = (_reg), \ + .div_shift = (_shift), \ + .div_width = (_width), \ + .div_type = (_type) + +#define K210_MUX(_reg, _bit) \ + .mux_reg = (_reg), \ + .mux_bit = (_bit) + +static struct k210_clk_cfg k210_clk_cfgs[K210_NUM_CLKS] = { + /* Gated clocks, no mux, no divider */ + [K210_CLK_CPU] = { + .name = "cpu", + K210_GATE(K210_SYSCTL_EN_CENT, 0) + }, + [K210_CLK_DMA] = { + .name = "dma", + K210_GATE(K210_SYSCTL_EN_PERI, 1) + }, + [K210_CLK_FFT] = { + .name = "fft", + K210_GATE(K210_SYSCTL_EN_PERI, 4) + }, + [K210_CLK_GPIO] = { + .name = "gpio", + K210_GATE(K210_SYSCTL_EN_PERI, 5) + }, + [K210_CLK_UART1] = { + .name = "uart1", + K210_GATE(K210_SYSCTL_EN_PERI, 16) + }, + [K210_CLK_UART2] = { + .name = "uart2", + K210_GATE(K210_SYSCTL_EN_PERI, 17) + }, + [K210_CLK_UART3] = { + .name = "uart3", + K210_GATE(K210_SYSCTL_EN_PERI, 18) + }, + [K210_CLK_FPIOA] = { + .name = "fpioa", + K210_GATE(K210_SYSCTL_EN_PERI, 20) + }, + [K210_CLK_SHA] = { + .name = "sha", + K210_GATE(K210_SYSCTL_EN_PERI, 26) + }, + [K210_CLK_AES] = { + .name = "aes", + K210_GATE(K210_SYSCTL_EN_PERI, 19) + }, + [K210_CLK_OTP] = { + .name = "otp", + K210_GATE(K210_SYSCTL_EN_PERI, 27) + }, + [K210_CLK_RTC] = { + .name = "rtc", + K210_GATE(K210_SYSCTL_EN_PERI, 29) + }, + + /* Gated divider clocks */ + [K210_CLK_SRAM0] = { + .name = "sram0", + K210_GATE(K210_SYSCTL_EN_CENT, 1), + K210_DIV(K210_SYSCTL_THR0, 0, 4, K210_DIV_ONE_BASED) + }, + [K210_CLK_SRAM1] = { + .name = "sram1", + K210_GATE(K210_SYSCTL_EN_CENT, 2), + K210_DIV(K210_SYSCTL_THR0, 4, 4, K210_DIV_ONE_BASED) + }, + [K210_CLK_ROM] = { + .name = "rom", + K210_GATE(K210_SYSCTL_EN_PERI, 0), + K210_DIV(K210_SYSCTL_THR0, 16, 4, K210_DIV_ONE_BASED) + }, + [K210_CLK_DVP] = { + .name = "dvp", + K210_GATE(K210_SYSCTL_EN_PERI, 3), + K210_DIV(K210_SYSCTL_THR0, 12, 4, K210_DIV_ONE_BASED) + }, + [K210_CLK_APB0] = { + .name = "apb0", + K210_GATE(K210_SYSCTL_EN_CENT, 3), + K210_DIV(K210_SYSCTL_SEL0, 3, 3, K210_DIV_ONE_BASED) + }, + [K210_CLK_APB1] = { + .name = "apb1", + K210_GATE(K210_SYSCTL_EN_CENT, 4), + K210_DIV(K210_SYSCTL_SEL0, 6, 3, K210_DIV_ONE_BASED) + }, + [K210_CLK_APB2] = { + .name = "apb2", + K210_GATE(K210_SYSCTL_EN_CENT, 5), + K210_DIV(K210_SYSCTL_SEL0, 9, 3, K210_DIV_ONE_BASED) + }, + [K210_CLK_AI] = { + .name = "ai", + K210_GATE(K210_SYSCTL_EN_PERI, 2), + K210_DIV(K210_SYSCTL_THR0, 8, 4, K210_DIV_ONE_BASED) + }, + [K210_CLK_SPI0] = { + .name = "spi0", + K210_GATE(K210_SYSCTL_EN_PERI, 6), + K210_DIV(K210_SYSCTL_THR1, 0, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_SPI1] = { + .name = "spi1", + K210_GATE(K210_SYSCTL_EN_PERI, 7), + K210_DIV(K210_SYSCTL_THR1, 8, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_SPI2] = { + .name = "spi2", + K210_GATE(K210_SYSCTL_EN_PERI, 8), + K210_DIV(K210_SYSCTL_THR1, 16, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2C0] = { + .name = "i2c0", + K210_GATE(K210_SYSCTL_EN_PERI, 13), + K210_DIV(K210_SYSCTL_THR5, 8, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2C1] = { + .name = "i2c1", + K210_GATE(K210_SYSCTL_EN_PERI, 14), + K210_DIV(K210_SYSCTL_THR5, 16, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2C2] = { + .name = "i2c2", + K210_GATE(K210_SYSCTL_EN_PERI, 15), + K210_DIV(K210_SYSCTL_THR5, 24, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_WDT0] = { + .name = "wdt0", + K210_GATE(K210_SYSCTL_EN_PERI, 24), + K210_DIV(K210_SYSCTL_THR6, 0, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_WDT1] = { + .name = "wdt1", + K210_GATE(K210_SYSCTL_EN_PERI, 25), + K210_DIV(K210_SYSCTL_THR6, 8, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2S0] = { + .name = "i2s0", + K210_GATE(K210_SYSCTL_EN_PERI, 10), + K210_DIV(K210_SYSCTL_THR3, 0, 16, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2S1] = { + .name = "i2s1", + K210_GATE(K210_SYSCTL_EN_PERI, 11), + K210_DIV(K210_SYSCTL_THR3, 16, 16, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2S2] = { + .name = "i2s2", + K210_GATE(K210_SYSCTL_EN_PERI, 12), + K210_DIV(K210_SYSCTL_THR4, 0, 16, K210_DIV_DOUBLE_ONE_BASED) + }, + + /* Divider clocks, no gate, no mux */ + [K210_CLK_I2S0_M] = { + .name = "i2s0_m", + K210_DIV(K210_SYSCTL_THR4, 16, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2S1_M] = { + .name = "i2s1_m", + K210_DIV(K210_SYSCTL_THR4, 24, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + [K210_CLK_I2S2_M] = { + .name = "i2s2_m", + K210_DIV(K210_SYSCTL_THR4, 0, 8, K210_DIV_DOUBLE_ONE_BASED) + }, + + /* Muxed gated divider clocks */ + [K210_CLK_SPI3] = { + .name = "spi3", + K210_GATE(K210_SYSCTL_EN_PERI, 9), + K210_DIV(K210_SYSCTL_THR1, 24, 8, K210_DIV_DOUBLE_ONE_BASED), + K210_MUX(K210_SYSCTL_SEL0, 12) + }, + [K210_CLK_TIMER0] = { + .name = "timer0", + K210_GATE(K210_SYSCTL_EN_PERI, 21), + K210_DIV(K210_SYSCTL_THR2, 0, 8, K210_DIV_DOUBLE_ONE_BASED), + K210_MUX(K210_SYSCTL_SEL0, 13) + }, + [K210_CLK_TIMER1] = { + .name = "timer1", + K210_GATE(K210_SYSCTL_EN_PERI, 22), + K210_DIV(K210_SYSCTL_THR2, 8, 8, K210_DIV_DOUBLE_ONE_BASED), + K210_MUX(K210_SYSCTL_SEL0, 14) + }, + [K210_CLK_TIMER2] = { + .name = "timer2", + K210_GATE(K210_SYSCTL_EN_PERI, 23), + K210_DIV(K210_SYSCTL_THR2, 16, 8, K210_DIV_DOUBLE_ONE_BASED), + K210_MUX(K210_SYSCTL_SEL0, 15) + }, +}; + +/* + * PLL control register bits. + */ +#define K210_PLL_CLKR GENMASK(3, 0) +#define K210_PLL_CLKF GENMASK(9, 4) +#define K210_PLL_CLKOD GENMASK(13, 10) +#define K210_PLL_BWADJ GENMASK(19, 14) +#define K210_PLL_RESET (1 << 20) +#define K210_PLL_PWRD (1 << 21) +#define K210_PLL_INTFB (1 << 22) +#define K210_PLL_BYPASS (1 << 23) +#define K210_PLL_TEST (1 << 24) +#define K210_PLL_EN (1 << 25) +#define K210_PLL_SEL GENMASK(27, 26) /* PLL2 only */ + +/* + * PLL lock register bits. + */ +#define K210_PLL_LOCK 0 +#define K210_PLL_CLEAR_SLIP 2 +#define K210_PLL_TEST_OUT 3 + +/* + * Clock selector register bits. + */ +#define K210_ACLK_SEL BIT(0) +#define K210_ACLK_DIV GENMASK(2, 1) + +/* + * PLLs. + */ +enum k210_pll_id { + K210_PLL0, K210_PLL1, K210_PLL2, K210_PLL_NUM +}; + +struct k210_pll { + enum k210_pll_id id; + struct k210_sysclk *ksc; + void __iomem *base; + void __iomem *reg; + void __iomem *lock; + u8 lock_shift; + u8 lock_width; + struct clk_hw hw; +}; +#define to_k210_pll(_hw) container_of(_hw, struct k210_pll, hw) + +/* + * PLLs configuration: by default PLL0 runs at 780 MHz and PLL1 at 299 MHz. + * The first 2 SRAM banks depend on ACLK/CPU clock which is by default PLL0 + * rate divided by 2. Set PLL1 to 390 MHz so that the third SRAM bank has the + * same clock as the first 2. + */ +struct k210_pll_cfg { + u32 reg; + u8 lock_shift; + u8 lock_width; + u32 r; + u32 f; + u32 od; + u32 bwadj; +}; + +static struct k210_pll_cfg k210_plls_cfg[] = { + { K210_SYSCTL_PLL0, 0, 2, 0, 59, 1, 59 }, /* 780 MHz */ + { K210_SYSCTL_PLL1, 8, 1, 0, 59, 3, 59 }, /* 390 MHz */ + { K210_SYSCTL_PLL2, 16, 1, 0, 22, 1, 22 }, /* 299 MHz */ +}; + +/** + * struct k210_sysclk - sysclk driver data + * @regs: system controller registers start address + * @clk_lock: clock setting spinlock + * @plls: SoC PLLs descriptors + * @aclk: ACLK clock + * @clks: All other clocks + */ +struct k210_sysclk { + void __iomem *regs; + spinlock_t clk_lock; + struct k210_pll plls[K210_PLL_NUM]; + struct clk_hw aclk; + struct k210_clk clks[K210_NUM_CLKS]; +}; + +#define to_k210_sysclk(_hw) container_of(_hw, struct k210_sysclk, aclk) + +/* + * Set ACLK parent selector: 0 for IN0, 1 for PLL0. + */ +static void k210_aclk_set_selector(void __iomem *regs, u8 sel) +{ + u32 reg = readl(regs + K210_SYSCTL_SEL0); + + if (sel) + reg |= K210_ACLK_SEL; + else + reg &= K210_ACLK_SEL; + writel(reg, regs + K210_SYSCTL_SEL0); +} + +static void k210_init_pll(void __iomem *regs, enum k210_pll_id pllid, + struct k210_pll *pll) +{ + pll->id = pllid; + pll->reg = regs + k210_plls_cfg[pllid].reg; + pll->lock = regs + K210_SYSCTL_PLL_LOCK; + pll->lock_shift = k210_plls_cfg[pllid].lock_shift; + pll->lock_width = k210_plls_cfg[pllid].lock_width; +} + +static void k210_pll_wait_for_lock(struct k210_pll *pll) +{ + u32 reg, mask = GENMASK(pll->lock_shift + pll->lock_width - 1, + pll->lock_shift); + + while (true) { + reg = readl(pll->lock); + if ((reg & mask) == mask) + break; + + reg |= BIT(pll->lock_shift + K210_PLL_CLEAR_SLIP); + writel(reg, pll->lock); + } +} + +static bool k210_pll_hw_is_enabled(struct k210_pll *pll) +{ + u32 reg = readl(pll->reg); + u32 mask = K210_PLL_PWRD | K210_PLL_EN; + + if (reg & K210_PLL_RESET) + return false; + + return (reg & mask) == mask; +} + +static void k210_pll_enable_hw(void __iomem *regs, struct k210_pll *pll) +{ + struct k210_pll_cfg *pll_cfg = &k210_plls_cfg[pll->id]; + u32 reg; + + if (k210_pll_hw_is_enabled(pll)) + return; + + /* + * For PLL0, we need to re-parent ACLK to IN0 to keep the CPU cores and + * SRAM running. + */ + if (pll->id == K210_PLL0) + k210_aclk_set_selector(regs, 0); + + /* Set PLL factors */ + reg = readl(pll->reg); + reg &= ~GENMASK(19, 0); + reg |= FIELD_PREP(K210_PLL_CLKR, pll_cfg->r); + reg |= FIELD_PREP(K210_PLL_CLKF, pll_cfg->f); + reg |= FIELD_PREP(K210_PLL_CLKOD, pll_cfg->od); + reg |= FIELD_PREP(K210_PLL_BWADJ, pll_cfg->bwadj); + reg |= K210_PLL_PWRD; + writel(reg, pll->reg); + + /* + * Reset the PLL: ensure reset is low before asserting it. + * The magic NOPs come from the Kendryte reference SDK. + */ + reg &= ~K210_PLL_RESET; + writel(reg, pll->reg); + reg |= K210_PLL_RESET; + writel(reg, pll->reg); + nop(); + nop(); + reg &= ~K210_PLL_RESET; + writel(reg, pll->reg); + + k210_pll_wait_for_lock(pll); + + reg &= ~K210_PLL_BYPASS; + reg |= K210_PLL_EN; + writel(reg, pll->reg); + + if (pll->id == K210_PLL0) + k210_aclk_set_selector(regs, 1); +} + +static int k210_pll_enable(struct clk_hw *hw) +{ + struct k210_pll *pll = to_k210_pll(hw); + struct k210_sysclk *ksc = pll->ksc; + unsigned long flags; + + spin_lock_irqsave(&ksc->clk_lock, flags); + + k210_pll_enable_hw(ksc->regs, pll); + + spin_unlock_irqrestore(&ksc->clk_lock, flags); + + return 0; +} + +static void k210_pll_disable(struct clk_hw *hw) +{ + struct k210_pll *pll = to_k210_pll(hw); + struct k210_sysclk *ksc = pll->ksc; + unsigned long flags; + u32 reg; + + /* + * Bypassing before powering off is important so child clocks do not + * stop working. This is especially important for pll0, the indirect + * parent of the cpu clock. + */ + spin_lock_irqsave(&ksc->clk_lock, flags); + reg = readl(pll->reg); + reg |= K210_PLL_BYPASS; + writel(reg, pll->reg); + + reg &= ~K210_PLL_PWRD; + reg &= ~K210_PLL_EN; + writel(reg, pll->reg); + spin_unlock_irqrestore(&ksc->clk_lock, flags); +} + +static int k210_pll_is_enabled(struct clk_hw *hw) +{ + return k210_pll_hw_is_enabled(to_k210_pll(hw)); +} + +static unsigned long k210_pll_get_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct k210_pll *pll = to_k210_pll(hw); + u32 reg = readl(pll->reg); + u32 r, f, od; + + if (reg & K210_PLL_BYPASS) + return parent_rate; + + if (!(reg & K210_PLL_PWRD)) + return 0; + + r = FIELD_GET(K210_PLL_CLKR, reg) + 1; + f = FIELD_GET(K210_PLL_CLKF, reg) + 1; + od = FIELD_GET(K210_PLL_CLKOD, reg) + 1; + + return (u64)parent_rate * f / (r * od); +} + +static const struct clk_ops k210_pll_ops = { + .enable = k210_pll_enable, + .disable = k210_pll_disable, + .is_enabled = k210_pll_is_enabled, + .recalc_rate = k210_pll_get_rate, +}; + +static int k210_pll2_set_parent(struct clk_hw *hw, u8 index) +{ + struct k210_pll *pll = to_k210_pll(hw); + struct k210_sysclk *ksc = pll->ksc; + unsigned long flags; + u32 reg; + + spin_lock_irqsave(&ksc->clk_lock, flags); + + reg = readl(pll->reg); + reg &= ~K210_PLL_SEL; + reg |= FIELD_PREP(K210_PLL_SEL, index); + writel(reg, pll->reg); + + spin_unlock_irqrestore(&ksc->clk_lock, flags); + + return 0; +} + +static u8 k210_pll2_get_parent(struct clk_hw *hw) +{ + struct k210_pll *pll = to_k210_pll(hw); + u32 reg = readl(pll->reg); + + return FIELD_GET(K210_PLL_SEL, reg); +} + +static const struct clk_ops k210_pll2_ops = { + .enable = k210_pll_enable, + .disable = k210_pll_disable, + .is_enabled = k210_pll_is_enabled, + .recalc_rate = k210_pll_get_rate, + .set_parent = k210_pll2_set_parent, + .get_parent = k210_pll2_get_parent, +}; + +static int __init k210_register_pll(struct device_node *np, + struct k210_sysclk *ksc, + enum k210_pll_id pllid, const char *name, + int num_parents, const struct clk_ops *ops) +{ + struct k210_pll *pll = &ksc->plls[pllid]; + struct clk_init_data init = {}; + const struct clk_parent_data parent_data[] = { + { /* .index = 0 for in0 */ }, + { .hw = &ksc->plls[K210_PLL0].hw }, + { .hw = &ksc->plls[K210_PLL1].hw }, + }; + + init.name = name; + init.parent_data = parent_data; + init.num_parents = num_parents; + init.ops = ops; + + pll->hw.init = &init; + pll->ksc = ksc; + + return of_clk_hw_register(np, &pll->hw); +} + +static int __init k210_register_plls(struct device_node *np, + struct k210_sysclk *ksc) +{ + int i, ret; + + for (i = 0; i < K210_PLL_NUM; i++) + k210_init_pll(ksc->regs, i, &ksc->plls[i]); + + /* PLL0 and PLL1 only have IN0 as parent */ + ret = k210_register_pll(np, ksc, K210_PLL0, "pll0", 1, &k210_pll_ops); + if (ret) { + pr_err("%pOFP: register PLL0 failed\n", np); + return ret; + } + ret = k210_register_pll(np, ksc, K210_PLL1, "pll1", 1, &k210_pll_ops); + if (ret) { + pr_err("%pOFP: register PLL1 failed\n", np); + return ret; + } + + /* PLL2 has IN0, PLL0 and PLL1 as parents */ + ret = k210_register_pll(np, ksc, K210_PLL2, "pll2", 3, &k210_pll2_ops); + if (ret) { + pr_err("%pOFP: register PLL2 failed\n", np); + return ret; + } + + return 0; +} + +static int k210_aclk_set_parent(struct clk_hw *hw, u8 index) +{ + struct k210_sysclk *ksc = to_k210_sysclk(hw); + unsigned long flags; + + spin_lock_irqsave(&ksc->clk_lock, flags); + + k210_aclk_set_selector(ksc->regs, index); + + spin_unlock_irqrestore(&ksc->clk_lock, flags); + + return 0; +} + +static u8 k210_aclk_get_parent(struct clk_hw *hw) +{ + struct k210_sysclk *ksc = to_k210_sysclk(hw); + u32 sel; + + sel = readl(ksc->regs + K210_SYSCTL_SEL0) & K210_ACLK_SEL; + + return sel ? 1 : 0; +} + +static unsigned long k210_aclk_get_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct k210_sysclk *ksc = to_k210_sysclk(hw); + u32 reg = readl(ksc->regs + K210_SYSCTL_SEL0); + unsigned int shift; + + if (!(reg & 0x1)) + return parent_rate; + + shift = FIELD_GET(K210_ACLK_DIV, reg); + + return parent_rate / (2UL << shift); +} + +static const struct clk_ops k210_aclk_ops = { + .set_parent = k210_aclk_set_parent, + .get_parent = k210_aclk_get_parent, + .recalc_rate = k210_aclk_get_rate, +}; + +/* + * ACLK has IN0 and PLL0 as parents. + */ +static int __init k210_register_aclk(struct device_node *np, + struct k210_sysclk *ksc) +{ + struct clk_init_data init = {}; + const struct clk_parent_data parent_data[] = { + { /* .index = 0 for in0 */ }, + { .hw = &ksc->plls[K210_PLL0].hw }, + }; + int ret; + + init.name = "aclk"; + init.parent_data = parent_data; + init.num_parents = 2; + init.ops = &k210_aclk_ops; + ksc->aclk.init = &init; + + ret = of_clk_hw_register(np, &ksc->aclk); + if (ret) { + pr_err("%pOFP: register aclk failed\n", np); + return ret; + } + + return 0; +} + +#define to_k210_clk(_hw) container_of(_hw, struct k210_clk, hw) + +static int k210_clk_enable(struct clk_hw *hw) +{ + struct k210_clk *kclk = to_k210_clk(hw); + struct k210_sysclk *ksc = kclk->ksc; + struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id]; + unsigned long flags; + u32 reg; + + if (!cfg->gate_reg) + return 0; + + spin_lock_irqsave(&ksc->clk_lock, flags); + reg = readl(ksc->regs + cfg->gate_reg); + reg |= BIT(cfg->gate_bit); + writel(reg, ksc->regs + cfg->gate_reg); + spin_unlock_irqrestore(&ksc->clk_lock, flags); + + return 0; +} + +static void k210_clk_disable(struct clk_hw *hw) +{ + struct k210_clk *kclk = to_k210_clk(hw); + struct k210_sysclk *ksc = kclk->ksc; + struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id]; + unsigned long flags; + u32 reg; + + if (!cfg->gate_reg) + return; + + spin_lock_irqsave(&ksc->clk_lock, flags); + reg = readl(ksc->regs + cfg->gate_reg); + reg &= ~BIT(cfg->gate_bit); + writel(reg, ksc->regs + cfg->gate_reg); + spin_unlock_irqrestore(&ksc->clk_lock, flags); +} + +static int k210_clk_set_parent(struct clk_hw *hw, u8 index) +{ + struct k210_clk *kclk = to_k210_clk(hw); + struct k210_sysclk *ksc = kclk->ksc; + struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id]; + unsigned long flags; + u32 reg; + + spin_lock_irqsave(&ksc->clk_lock, flags); + reg = readl(ksc->regs + cfg->mux_reg); + if (index) + reg |= BIT(cfg->mux_bit); + else + reg &= ~BIT(cfg->mux_bit); + spin_unlock_irqrestore(&ksc->clk_lock, flags); + + return 0; +} + +static u8 k210_clk_get_parent(struct clk_hw *hw) +{ + struct k210_clk *kclk = to_k210_clk(hw); + struct k210_sysclk *ksc = kclk->ksc; + struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id]; + unsigned long flags; + u32 reg, idx; + + spin_lock_irqsave(&ksc->clk_lock, flags); + reg = readl(ksc->regs + cfg->mux_reg); + idx = (reg & BIT(cfg->mux_bit)) ? 1 : 0; + spin_unlock_irqrestore(&ksc->clk_lock, flags); + + return idx; +} + +static unsigned long k210_clk_get_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct k210_clk *kclk = to_k210_clk(hw); + struct k210_sysclk *ksc = kclk->ksc; + struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id]; + u32 reg, div_val; + + if (!cfg->div_reg) + return parent_rate; + + reg = readl(ksc->regs + cfg->div_reg); + div_val = (reg >> cfg->div_shift) & GENMASK(cfg->div_width - 1, 0); + + switch (cfg->div_type) { + case K210_DIV_ONE_BASED: + return parent_rate / (div_val + 1); + case K210_DIV_DOUBLE_ONE_BASED: + return parent_rate / ((div_val + 1) * 2); + case K210_DIV_POWER_OF_TWO: + return parent_rate / (2UL << div_val); + case K210_DIV_NONE: + default: + return 0; + } +} + +static const struct clk_ops k210_clk_mux_ops = { + .enable = k210_clk_enable, + .disable = k210_clk_disable, + .set_parent = k210_clk_set_parent, + .get_parent = k210_clk_get_parent, + .recalc_rate = k210_clk_get_rate, +}; + +static const struct clk_ops k210_clk_ops = { + .enable = k210_clk_enable, + .disable = k210_clk_disable, + .recalc_rate = k210_clk_get_rate, +}; + +static void __init k210_register_clk(struct device_node *np, + struct k210_sysclk *ksc, int id, + const struct clk_parent_data *parent_data, + int num_parents, unsigned long flags) +{ + struct k210_clk *kclk = &ksc->clks[id]; + struct clk_init_data init = {}; + int ret; + + init.name = k210_clk_cfgs[id].name; + init.flags = flags; + init.parent_data = parent_data; + init.num_parents = num_parents; + if (num_parents > 1) + init.ops = &k210_clk_mux_ops; + else + init.ops = &k210_clk_ops; + + kclk->id = id; + kclk->ksc = ksc; + kclk->hw.init = &init; + + ret = of_clk_hw_register(np, &kclk->hw); + if (ret) { + pr_err("%pOFP: register clock %s failed\n", + np, k210_clk_cfgs[id].name); + kclk->id = -1; + } +} + +/* + * All muxed clocks have IN0 and PLL0 as parents. + */ +static inline void __init k210_register_mux_clk(struct device_node *np, + struct k210_sysclk *ksc, int id) +{ + const struct clk_parent_data parent_data[2] = { + { /* .index = 0 for in0 */ }, + { .hw = &ksc->plls[K210_PLL0].hw } + }; + + k210_register_clk(np, ksc, id, parent_data, 2, 0); +} + +static inline void __init k210_register_in0_child(struct device_node *np, + struct k210_sysclk *ksc, int id) +{ + const struct clk_parent_data parent_data = { + /* .index = 0 for in0 */ + }; + + k210_register_clk(np, ksc, id, &parent_data, 1, 0); +} + +static inline void __init k210_register_pll_child(struct device_node *np, + struct k210_sysclk *ksc, int id, + enum k210_pll_id pllid, + unsigned long flags) +{ + const struct clk_parent_data parent_data = { + .hw = &ksc->plls[pllid].hw, + }; + + k210_register_clk(np, ksc, id, &parent_data, 1, flags); +} + +static inline void __init k210_register_aclk_child(struct device_node *np, + struct k210_sysclk *ksc, int id, + unsigned long flags) +{ + const struct clk_parent_data parent_data = { + .hw = &ksc->aclk, + }; + + k210_register_clk(np, ksc, id, &parent_data, 1, flags); +} + +static inline void __init k210_register_clk_child(struct device_node *np, + struct k210_sysclk *ksc, int id, + int parent_id) +{ + const struct clk_parent_data parent_data = { + .hw = &ksc->clks[parent_id].hw, + }; + + k210_register_clk(np, ksc, id, &parent_data, 1, 0); +} + +static struct clk_hw *k210_clk_hw_onecell_get(struct of_phandle_args *clkspec, + void *data) +{ + struct k210_sysclk *ksc = data; + unsigned int idx = clkspec->args[0]; + + if (idx >= K210_NUM_CLKS) + return ERR_PTR(-EINVAL); + + return &ksc->clks[idx].hw; +} + +static void __init k210_clk_init(struct device_node *np) +{ + struct device_node *sysctl_np; + struct k210_sysclk *ksc; + int i, ret; + + ksc = kzalloc(sizeof(*ksc), GFP_KERNEL); + if (!ksc) + return; + + spin_lock_init(&ksc->clk_lock); + sysctl_np = of_get_parent(np); + ksc->regs = of_iomap(sysctl_np, 0); + of_node_put(sysctl_np); + if (!ksc->regs) { + pr_err("%pOFP: failed to map registers\n", np); + return; + } + + ret = k210_register_plls(np, ksc); + if (ret) + return; + + ret = k210_register_aclk(np, ksc); + if (ret) + return; + + /* + * Critical clocks: there are no consumers of the SRAM clocks, + * including the AI clock for the third SRAM bank. The CPU clock + * is only referenced by the uarths serial device and so would be + * disabled if the serial console is disabled to switch to another + * console. Mark all these clocks as critical so that they are never + * disabled by the core clock management. + */ + k210_register_aclk_child(np, ksc, K210_CLK_CPU, CLK_IS_CRITICAL); + k210_register_aclk_child(np, ksc, K210_CLK_SRAM0, CLK_IS_CRITICAL); + k210_register_aclk_child(np, ksc, K210_CLK_SRAM1, CLK_IS_CRITICAL); + k210_register_pll_child(np, ksc, K210_CLK_AI, K210_PLL1, + CLK_IS_CRITICAL); + + /* Clocks with aclk as source */ + k210_register_aclk_child(np, ksc, K210_CLK_DMA, 0); + k210_register_aclk_child(np, ksc, K210_CLK_FFT, 0); + k210_register_aclk_child(np, ksc, K210_CLK_ROM, 0); + k210_register_aclk_child(np, ksc, K210_CLK_DVP, 0); + k210_register_aclk_child(np, ksc, K210_CLK_APB0, 0); + k210_register_aclk_child(np, ksc, K210_CLK_APB1, 0); + k210_register_aclk_child(np, ksc, K210_CLK_APB2, 0); + + /* Clocks with PLL0 as source */ + k210_register_pll_child(np, ksc, K210_CLK_SPI0, K210_PLL0, 0); + k210_register_pll_child(np, ksc, K210_CLK_SPI1, K210_PLL0, 0); + k210_register_pll_child(np, ksc, K210_CLK_SPI2, K210_PLL0, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2C0, K210_PLL0, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2C1, K210_PLL0, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2C2, K210_PLL0, 0); + + /* Clocks with PLL2 as source */ + k210_register_pll_child(np, ksc, K210_CLK_I2S0, K210_PLL2, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2S1, K210_PLL2, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2S2, K210_PLL2, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2S0_M, K210_PLL2, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2S1_M, K210_PLL2, 0); + k210_register_pll_child(np, ksc, K210_CLK_I2S2_M, K210_PLL2, 0); + + /* Clocks with IN0 as source */ + k210_register_in0_child(np, ksc, K210_CLK_WDT0); + k210_register_in0_child(np, ksc, K210_CLK_WDT1); + k210_register_in0_child(np, ksc, K210_CLK_RTC); + + /* Clocks with APB0 as source */ + k210_register_clk_child(np, ksc, K210_CLK_GPIO, K210_CLK_APB0); + k210_register_clk_child(np, ksc, K210_CLK_UART1, K210_CLK_APB0); + k210_register_clk_child(np, ksc, K210_CLK_UART2, K210_CLK_APB0); + k210_register_clk_child(np, ksc, K210_CLK_UART3, K210_CLK_APB0); + k210_register_clk_child(np, ksc, K210_CLK_FPIOA, K210_CLK_APB0); + k210_register_clk_child(np, ksc, K210_CLK_SHA, K210_CLK_APB0); + + /* Clocks with APB1 as source */ + k210_register_clk_child(np, ksc, K210_CLK_AES, K210_CLK_APB1); + k210_register_clk_child(np, ksc, K210_CLK_OTP, K210_CLK_APB1); + + /* Mux clocks with in0 or pll0 as source */ + k210_register_mux_clk(np, ksc, K210_CLK_SPI3); + k210_register_mux_clk(np, ksc, K210_CLK_TIMER0); + k210_register_mux_clk(np, ksc, K210_CLK_TIMER1); + k210_register_mux_clk(np, ksc, K210_CLK_TIMER2); + + /* Check for registration errors */ + for (i = 0; i < K210_NUM_CLKS; i++) { + if (ksc->clks[i].id != i) + return; + } + + ret = of_clk_add_hw_provider(np, k210_clk_hw_onecell_get, ksc); + if (ret) { + pr_err("%pOFP: add clock provider failed %d\n", np, ret); + return; + } + + pr_info("%pOFP: CPU running at %lu MHz\n", + np, clk_hw_get_rate(&ksc->clks[K210_CLK_CPU].hw) / 1000000); +} + +CLK_OF_DECLARE(k210_clk, "canaan,k210-clk", k210_clk_init); + +/* + * Enable PLL1 to be able to use the AI SRAM. + */ +void __init k210_clk_early_init(void __iomem *regs) +{ + struct k210_pll pll1; + + /* Make sure ACLK selector is set to PLL0 */ + k210_aclk_set_selector(regs, 1); + + /* Startup PLL1 to enable the aisram bank for general memory use */ + k210_init_pll(regs, K210_PLL1, &pll1); + k210_pll_enable_hw(regs, &pll1); +} diff --git a/drivers/soc/canaan/Kconfig b/drivers/soc/canaan/Kconfig index 5232d13f07e5..8179b69518b4 100644 --- a/drivers/soc/canaan/Kconfig +++ b/drivers/soc/canaan/Kconfig @@ -1,14 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 -if SOC_CANAAN - -config K210_SYSCTL +config SOC_K210_SYSCTL bool "Canaan Kendryte K210 SoC system controller" - default y - depends on RISCV + depends on RISCV && SOC_CANAAN && OF + default SOC_CANAAN + select PM + select SIMPLE_PM_BUS + select SYSCON + select MFD_SYSCON help - Enables controlling the K210 various clocks and to enable - general purpose use of the extra 2MB of SRAM normally - reserved for the AI engine. - -endif + Canaan Kendryte K210 SoC system controller driver. diff --git a/drivers/soc/canaan/Makefile b/drivers/soc/canaan/Makefile index 002d9ce95c0d..570280ad7967 100644 --- a/drivers/soc/canaan/Makefile +++ b/drivers/soc/canaan/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_K210_SYSCTL) += k210-sysctl.o +obj-$(CONFIG_SOC_K210_SYSCTL) += k210-sysctl.o diff --git a/drivers/soc/canaan/k210-sysctl.c b/drivers/soc/canaan/k210-sysctl.c index 60b474c33d45..27a346c406bc 100644 --- a/drivers/soc/canaan/k210-sysctl.c +++ b/drivers/soc/canaan/k210-sysctl.c @@ -3,165 +3,45 @@ * Copyright (c) 2019 Christoph Hellwig. * Copyright (c) 2019 Western Digital Corporation or its affiliates. */ -#include #include -#include #include -#include -#include -#include +#include +#include #include #include -#define K210_SYSCTL_CLK0_FREQ 26000000UL - -/* Registers base address */ -#define K210_SYSCTL_SYSCTL_BASE_ADDR 0x50440000ULL - -/* Register bits */ -/* K210_SYSCTL_PLL1: clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */ -#define PLL_RESET (1 << 20) -#define PLL_PWR (1 << 21) -#define PLL_BYPASS (1 << 23) -#define PLL_OUT_EN (1 << 25) -/* K210_SYSCTL_PLL_LOCK */ -#define PLL1_LOCK1 (1 << 8) -#define PLL1_LOCK2 (1 << 9) -#define PLL1_SLIP_CLEAR (1 << 10) -/* K210_SYSCTL_SEL0 */ -#define CLKSEL_ACLK (1 << 0) -/* K210_SYSCTL_CLKEN_CENT */ -#define CLKEN_CPU (1 << 0) -#define CLKEN_SRAM0 (1 << 1) -#define CLKEN_SRAM1 (1 << 2) -/* K210_SYSCTL_EN_PERI */ -#define CLKEN_ROM (1 << 0) -#define CLKEN_TIMER0 (1 << 21) -#define CLKEN_RTC (1 << 29) - -struct k210_sysctl { - void __iomem *regs; - struct clk_hw hw; -}; - -static void k210_set_bits(u32 val, void __iomem *reg) -{ - writel(readl(reg) | val, reg); -} - -static void k210_clear_bits(u32 val, void __iomem *reg) -{ - writel(readl(reg) & ~val, reg); -} - -static void k210_pll1_enable(void __iomem *regs) -{ - u32 val; - - val = readl(regs + K210_SYSCTL_PLL1); - val &= ~GENMASK(19, 0); /* clkr1 = 0 */ - val |= FIELD_PREP(GENMASK(9, 4), 0x3B); /* clkf1 = 59 */ - val |= FIELD_PREP(GENMASK(13, 10), 0x3); /* clkod1 = 3 */ - val |= FIELD_PREP(GENMASK(19, 14), 0x3B); /* bwadj1 = 59 */ - writel(val, regs + K210_SYSCTL_PLL1); - - k210_clear_bits(PLL_BYPASS, regs + K210_SYSCTL_PLL1); - k210_set_bits(PLL_PWR, regs + K210_SYSCTL_PLL1); - - /* - * Reset the pll. The magic NOPs come from the Kendryte reference SDK. - */ - k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1); - k210_set_bits(PLL_RESET, regs + K210_SYSCTL_PLL1); - nop(); - nop(); - k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1); - - for (;;) { - val = readl(regs + K210_SYSCTL_PLL_LOCK); - if (val & PLL1_LOCK2) - break; - writel(val | PLL1_SLIP_CLEAR, regs + K210_SYSCTL_PLL_LOCK); - } - - k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL1); -} - -static unsigned long k210_sysctl_clk_recalc_rate(struct clk_hw *hw, - unsigned long parent_rate) -{ - struct k210_sysctl *s = container_of(hw, struct k210_sysctl, hw); - u32 clksel0, pll0; - u64 pll0_freq, clkr0, clkf0, clkod0; - - /* - * If the clock selector is not set, use the base frequency. - * Otherwise, use PLL0 frequency with a frequency divisor. - */ - clksel0 = readl(s->regs + K210_SYSCTL_SEL0); - if (!(clksel0 & CLKSEL_ACLK)) - return K210_SYSCTL_CLK0_FREQ; - - /* - * Get PLL0 frequency: - * freq = base frequency * clkf0 / (clkr0 * clkod0) - */ - pll0 = readl(s->regs + K210_SYSCTL_PLL0); - clkr0 = 1 + FIELD_GET(GENMASK(3, 0), pll0); - clkf0 = 1 + FIELD_GET(GENMASK(9, 4), pll0); - clkod0 = 1 + FIELD_GET(GENMASK(13, 10), pll0); - pll0_freq = clkf0 * K210_SYSCTL_CLK0_FREQ / (clkr0 * clkod0); - - /* Get the frequency divisor from the clock selector */ - return pll0_freq / (2ULL << FIELD_GET(0x00000006, clksel0)); -} - -static const struct clk_ops k210_sysctl_clk_ops = { - .recalc_rate = k210_sysctl_clk_recalc_rate, -}; - -static const struct clk_init_data k210_clk_init_data = { - .name = "k210-sysctl-pll1", - .ops = &k210_sysctl_clk_ops, -}; - static int k210_sysctl_probe(struct platform_device *pdev) { - struct k210_sysctl *s; - int error; + struct device *dev = &pdev->dev; + struct clk *pclk; + int ret; - pr_info("Kendryte K210 SoC sysctl\n"); + dev_info(dev, "K210 system controller\n"); - s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + /* Get power bus clock */ + pclk = devm_clk_get(dev, NULL); + if (IS_ERR(pclk)) + return dev_err_probe(dev, PTR_ERR(pclk), + "Get bus clock failed\n"); - s->regs = devm_ioremap_resource(&pdev->dev, - platform_get_resource(pdev, IORESOURCE_MEM, 0)); - if (IS_ERR(s->regs)) - return PTR_ERR(s->regs); - - s->hw.init = &k210_clk_init_data; - error = devm_clk_hw_register(&pdev->dev, &s->hw); - if (error) { - dev_err(&pdev->dev, "failed to register clk"); - return error; + ret = clk_prepare_enable(pclk); + if (ret) { + dev_err(dev, "Enable bus clock failed\n"); + return ret; } - error = devm_of_clk_add_hw_provider(&pdev->dev, of_clk_hw_simple_get, - &s->hw); - if (error) { - dev_err(&pdev->dev, "adding clk provider failed\n"); - return error; - } + /* Populate children */ + ret = devm_of_platform_populate(dev); + if (ret) + dev_err(dev, "Populate platform failed %d\n", ret); - return 0; + return ret; } static const struct of_device_id k210_sysctl_of_match[] = { - { .compatible = "kendryte,k210-sysctl", }, - {} + { .compatible = "canaan,k210-sysctl", }, + { /* sentinel */ }, }; static struct platform_driver k210_sysctl_driver = { @@ -171,12 +51,13 @@ static struct platform_driver k210_sysctl_driver = { }, .probe = k210_sysctl_probe, }; +builtin_platform_driver(k210_sysctl_driver); -static int __init k210_sysctl_init(void) -{ - return platform_driver_register(&k210_sysctl_driver); -} -core_initcall(k210_sysctl_init); +/* + * System controller registers base address and size. + */ +#define K210_SYSCTL_BASE_ADDR 0x50440000ULL +#define K210_SYSCTL_BASE_SIZE 0x1000 /* * This needs to be called very early during initialization, given that @@ -184,24 +65,14 @@ core_initcall(k210_sysctl_init); */ static void __init k210_soc_early_init(const void *fdt) { - void __iomem *regs; + void __iomem *sysctl_base; - regs = ioremap(K210_SYSCTL_SYSCTL_BASE_ADDR, 0x1000); - if (!regs) - panic("K210 sysctl ioremap"); + sysctl_base = ioremap(K210_SYSCTL_BASE_ADDR, K210_SYSCTL_BASE_SIZE); + if (!sysctl_base) + panic("k210-sysctl: ioremap failed"); - /* Enable PLL1 to make the KPU SRAM useable */ - k210_pll1_enable(regs); + k210_clk_early_init(sysctl_base); - k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL0); - - k210_set_bits(CLKEN_CPU | CLKEN_SRAM0 | CLKEN_SRAM1, - regs + K210_SYSCTL_EN_CENT); - k210_set_bits(CLKEN_ROM | CLKEN_TIMER0 | CLKEN_RTC, - regs + K210_SYSCTL_EN_PERI); - - k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_SEL0); - - iounmap(regs); + iounmap(sysctl_base); } -SOC_EARLY_INIT_DECLARE(generic_k210, "kendryte,k210", k210_soc_early_init); +SOC_EARLY_INIT_DECLARE(k210_soc, "canaan,kendryte-k210", k210_soc_early_init); diff --git a/include/soc/canaan/k210-sysctl.h b/include/soc/canaan/k210-sysctl.h index 2e037db68f35..0c2b2c2dabca 100644 --- a/include/soc/canaan/k210-sysctl.h +++ b/include/soc/canaan/k210-sysctl.h @@ -38,4 +38,6 @@ #define K210_SYSCTL_DMA_SEL1 0x68 /* DMA handshake selector 1 */ #define K210_SYSCTL_POWER_SEL 0x6C /* IO Power Mode Select controller */ +void k210_clk_early_init(void __iomem *regs); + #endif From e7d9fea1c78a60c5cc5b0d708b89f1bfebf429b2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:15 +0900 Subject: [PATCH 245/506] dt-bindings: update MAINTAINERS file Add a reference to the Canaan K210 system controller driver bindings file Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml in the MAINTAINERS file entry for this driver. Signed-off-by: Damien Le Moal Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 54f356e67491..467c8670d815 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3879,6 +3879,7 @@ CANAAN/KENDRYTE K210 SOC SYSTEM CONTROLLER DRIVER M: Damien Le Moal L: linux-riscv@lists.infradead.org S: Maintained +F: Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml F: drivers/soc/canaan/ F: include/soc/canaan/ From 11481d6b5783fe4b6a6ba2870e49da4b4ebb2259 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:16 +0900 Subject: [PATCH 246/506] dt-bindings: add Canaan boards compatible strings Introduce the file riscv/canaan.yaml to document compatible strings related to the Canaan Kendryte K210 SoC. The compatible string "canaan,kendryte-k210" used to indicate the use of this SoC to the early SoC init code is added. This new file also defines the compatible strings of all supported boards based on this SoC. Signed-off-by: Damien Le Moal Reviewed-by: Atish Patra Reviewed-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../devicetree/bindings/riscv/canaan.yaml | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 Documentation/devicetree/bindings/riscv/canaan.yaml diff --git a/Documentation/devicetree/bindings/riscv/canaan.yaml b/Documentation/devicetree/bindings/riscv/canaan.yaml new file mode 100644 index 000000000000..f8f3f286bd55 --- /dev/null +++ b/Documentation/devicetree/bindings/riscv/canaan.yaml @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/riscv/canaan.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Canaan SoC-based boards + +maintainers: + - Damien Le Moal + +description: + Canaan Kendryte K210 SoC-based boards + +properties: + $nodename: + const: '/' + compatible: + oneOf: + - items: + - const: sipeed,maix-bit + - const: sipeed,maix-bitm + - const: canaan,kendryte-k210 + + - items: + - const: sipeed,maix-go + - const: canaan,kendryte-k210 + + - items: + - const: sipeed,maix-dock-m1 + - const: sipeed,maix-dock-m1w + - const: canaan,kendryte-k210 + + - items: + - const: sipeed,maixduino + - const: canaan,kendryte-k210 + + - items: + - const: canaan,kendryte-kd233 + - const: canaan,kendryte-k210 + + - items: + - const: canaan,kendryte-k210 + +additionalProperties: true + +... From 7ef71c719eb462edaa6078405654d2447c7a5488 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:17 +0900 Subject: [PATCH 247/506] dt-bindings: update risc-v cpu properties The Canaan Kendryte K210 SoC CPU cores are based on a rocket chip version using a draft verion of the RISC-V ISA specifications. To avoid any confusion with CPU cores using stable specifications, add the compatible string "canaan,k210" for this SoC CPU cores. Also add the "riscv,none" value to the mmu-type property to allow a DT to indicate that the CPU being described does not have an MMU or that it has an MMU that is not usable (which is the case for the K210 SoC). Signed-off-by: Damien Le Moal Reviewed-by: Atish Patra Reviewed-by: Anup Patel Acked-by: Rob Herring Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/cpus.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index eb6843f69f7c..e534f6a7cfa1 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -39,6 +39,7 @@ properties: - sifive,u74 - sifive,u5 - sifive,u7 + - canaan,k210 - const: riscv - const: riscv # Simulator only description: @@ -56,6 +57,7 @@ properties: - riscv,sv32 - riscv,sv39 - riscv,sv48 + - riscv,none riscv,isa: description: From 90ddcd642a41b72498817da9dd21ed09d6e4f8e0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:18 +0900 Subject: [PATCH 248/506] dt-bindings: update sifive plic compatible string Add the compatible string "canaan,k210-plic" to the Sifive plic bindings to indicate the use of the "sifive,plic-1.0.0" IP block in the Canaan Kendryte K210 SoC. The description is also updated to reflect this change, that is, that SoCs from other vendors may also use this plic implementation. Signed-off-by: Damien Le Moal Acked-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../interrupt-controller/sifive,plic-1.0.0.yaml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml index b9a61c9f7530..08d5a57ce00f 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml @@ -8,10 +8,11 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: SiFive Platform-Level Interrupt Controller (PLIC) description: - SiFive SOCs include an implementation of the Platform-Level Interrupt Controller - (PLIC) high-level specification in the RISC-V Privileged Architecture - specification. The PLIC connects all external interrupts in the system to all - hart contexts in the system, via the external interrupt source in each hart. + SiFive SoCs and other RISC-V SoCs include an implementation of the + Platform-Level Interrupt Controller (PLIC) high-level specification in + the RISC-V Privileged Architecture specification. The PLIC connects all + external interrupts in the system to all hart contexts in the system, via + the external interrupt source in each hart. A hart context is a privilege mode in a hardware execution thread. For example, in an 4 core system with 2-way SMT, you have 8 harts and probably at least two @@ -42,7 +43,9 @@ maintainers: properties: compatible: items: - - const: sifive,fu540-c000-plic + - enum: + - sifive,fu540-c000-plic + - canaan,k210-plic - const: sifive,plic-1.0.0 reg: From c43b5718016bc2eb144e8aa0ddf21887d0709edf Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:19 +0900 Subject: [PATCH 249/506] dt-bindings: update sifive clint compatible string Add the "canaan,k210-clint" compatible string to the Sifive clint bindings to indicate the use of the "sifive,clint0" IP block in the Canaan Kendryte K210 SoC. The description of the compatible string property is also updated to reflect this addition. Signed-off-by: Damien Le Moal Acked-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../devicetree/bindings/timer/sifive,clint.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/sifive,clint.yaml b/Documentation/devicetree/bindings/timer/sifive,clint.yaml index 2a0e9cd9fbcf..a35952f48742 100644 --- a/Documentation/devicetree/bindings/timer/sifive,clint.yaml +++ b/Documentation/devicetree/bindings/timer/sifive,clint.yaml @@ -23,15 +23,19 @@ description: properties: compatible: items: - - const: sifive,fu540-c000-clint + - enum: + - sifive,fu540-c000-clint + - canaan,k210-clint - const: sifive,clint0 description: - Should be "sifive,-clint" and "sifive,clint". + Should be ",-clint" and "sifive,clint". Supported compatible strings are - "sifive,fu540-c000-clint" for the SiFive CLINT v0 as integrated - onto the SiFive FU540 chip, and "sifive,clint0" for the SiFive - CLINT v0 IP block with no chip integration tweaks. + onto the SiFive FU540 chip, "canaan,k210-clint" for the SiFive + CLINT v0 as integrated onto the Canaan Kendryte K210 chip, and + "sifive,clint0" for the SiFive CLINT v0 IP block with no chip + integration tweaks. Please refer to sifive-blocks-ip-versioning.txt for details reg: From 8fc8719c28f72700efdd42d4c3733496a73233e4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:20 +0900 Subject: [PATCH 250/506] dt-bindings: update sifive uart compatible string Add the compatible string "canaan,k210-uarths" to the sifive uart bindings to indicate the use of this IP block in the Canaan Kendryte K210 SoC. Signed-off-by: Damien Le Moal Reviewed-by: Atish Patra Acked-by: Rob Herring Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/serial/sifive-serial.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/serial/sifive-serial.yaml b/Documentation/devicetree/bindings/serial/sifive-serial.yaml index 3ac5c7ff2758..5fa94dacbba9 100644 --- a/Documentation/devicetree/bindings/serial/sifive-serial.yaml +++ b/Documentation/devicetree/bindings/serial/sifive-serial.yaml @@ -20,6 +20,7 @@ properties: - enum: - sifive,fu540-c000-uart - sifive,fu740-c000-uart + - canaan,k210-uarths - const: sifive,uart0 description: From 3933cf6afd4535aa66528de9de007c69195b377e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:21 +0900 Subject: [PATCH 251/506] dt-bindings: fix sifive gpio properties The sifive gpio IP block supports up to 32 GPIOs. Reflect that in the interrupts property description and maxItems. Also add the standard ngpios property to describe the number of GPIOs available on the implementation. Also add the "canaan,k210-gpiohs" compatible string to indicate the use of this gpio controller in the Canaan Kendryte K210 SoC. If this compatible string is used, do not define the clocks property as required as the K210 SoC does not have a software controllable clock for the Sifive gpio IP block. Signed-off-by: Damien Le Moal Reviewed-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../devicetree/bindings/gpio/sifive,gpio.yaml | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml index ab22056f8b44..c2902aac2514 100644 --- a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml @@ -16,6 +16,7 @@ properties: - enum: - sifive,fu540-c000-gpio - sifive,fu740-c000-gpio + - canaan,k210-gpiohs - const: sifive,gpio0 reg: @@ -23,9 +24,9 @@ properties: interrupts: description: - interrupt mapping one per GPIO. Maximum 16 GPIOs. + Interrupt mapping, one per GPIO. Maximum 32 GPIOs. minItems: 1 - maxItems: 16 + maxItems: 32 interrupt-controller: true @@ -38,6 +39,14 @@ properties: "#gpio-cells": const: 2 + ngpios: + description: + The number of GPIOs available on the controller implementation. + It is 16 for the SiFive SoCs and 32 for the Canaan K210. + minimum: 1 + maximum: 32 + default: 16 + gpio-controller: true required: @@ -46,10 +55,20 @@ required: - interrupts - interrupt-controller - "#interrupt-cells" - - clocks - "#gpio-cells" - gpio-controller +if: + properties: + compatible: + contains: + enum: + - sifive,fu540-c000-gpio + - sifive,fu740-c000-gpio +then: + required: + - clocks + additionalProperties: false examples: From 13dcfae0b23489118654005b9328aa3a5706c859 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:22 +0900 Subject: [PATCH 252/506] dt-bindings: add resets property to dw-apb-timer The Synopsis DesignWare APB timer driver (drivers/clocksource/dw_apb_timer_of.c) indirectly uses the resets property of its node as it executes the function of_reset_control_get(). Make sure that this property is documented in timer/snps,dw-apb-timer.yaml to avoid make dtbs_check warnings. Signed-off-by: Damien Le Moal Reviewed-by: Atish Patra Acked-by: Rob Herring Acked-by: Daniel Lezcano Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/timer/snps,dw-apb-timer.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/snps,dw-apb-timer.yaml b/Documentation/devicetree/bindings/timer/snps,dw-apb-timer.yaml index d65faf289a83..d33c9205a909 100644 --- a/Documentation/devicetree/bindings/timer/snps,dw-apb-timer.yaml +++ b/Documentation/devicetree/bindings/timer/snps,dw-apb-timer.yaml @@ -24,6 +24,9 @@ properties: interrupts: maxItems: 1 + resets: + maxItems: 1 + clocks: minItems: 1 items: From 67d96729a9e789ecfddb0f701e5ec18389758dab Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:23 +0900 Subject: [PATCH 253/506] riscv: Update Canaan Kendryte K210 device tree Update the Canaan Kendryte K210 base device tree k210.dtsi to define all supported peripherals of the SoC, their clocks and reset lines. The device tree file k210.dts is renamed to k210_generic.dts and becomes the default value selection of the configuration option SOC_CANAAN_K210_DTB_BUILTIN_SOURCE. No device beside the serial console is defined by this device tree. This makes this generic device tree suitable for use with a builtin initramfs with all known K210 based boards. These changes result in the K210_CLK_ACLK clock ID to be unused and removed from the dt-bindings k210-clk.h header file. Most updates to the k210.dtsi file come from Sean Anderson's work on U-Boot support for the K210. Cc: Rob Herring Cc: devicetree@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Rob Herring Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.socs | 2 +- arch/riscv/boot/dts/canaan/k210.dts | 23 -- arch/riscv/boot/dts/canaan/k210.dtsi | 395 ++++++++++++++++++-- arch/riscv/boot/dts/canaan/k210_generic.dts | 46 +++ include/dt-bindings/clock/k210-clk.h | 1 - 5 files changed, 414 insertions(+), 53 deletions(-) delete mode 100644 arch/riscv/boot/dts/canaan/k210.dts create mode 100644 arch/riscv/boot/dts/canaan/k210_generic.dts diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 6402746c68f3..7efcece8896c 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -51,7 +51,7 @@ config SOC_CANAAN_K210_DTB_SOURCE string "Source file for the Canaan Kendryte K210 builtin DTB" depends on SOC_CANAAN depends on SOC_CANAAN_K210_DTB_BUILTIN - default "k210" + default "k210_generic" help Base name (without suffix, relative to arch/riscv/boot/dts/canaan) for the DTS file that will be used to produce the DTB linked into the diff --git a/arch/riscv/boot/dts/canaan/k210.dts b/arch/riscv/boot/dts/canaan/k210.dts deleted file mode 100644 index 0d1f28fce6b2..000000000000 --- a/arch/riscv/boot/dts/canaan/k210.dts +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright (C) 2020 Western Digital Corporation or its affiliates. - */ - -/dts-v1/; - -#include "k210.dtsi" - -/ { - model = "Kendryte K210 generic"; - compatible = "kendryte,k210"; - - chosen { - bootargs = "earlycon console=ttySIF0"; - stdout-path = "serial0"; - }; -}; - -&uarths0 { - status = "okay"; -}; - diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index 354b263195a3..5e8ca8142482 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright (C) 2019 Sean Anderson + * Copyright (C) 2019-20 Sean Anderson * Copyright (C) 2020 Western Digital Corporation or its affiliates. */ #include +#include +#include / { /* @@ -12,14 +14,17 @@ / { */ #address-cells = <1>; #size-cells = <1>; - compatible = "kendryte,k210"; + compatible = "canaan,kendryte-k210"; aliases { serial0 = &uarths0; + serial1 = &uart1; + serial2 = &uart2; + serial3 = &uart3; }; /* - * The K210 has an sv39 MMU following the priviledge specification v1.9. + * The K210 has an sv39 MMU following the privileged specification v1.9. * Since this is a non-ratified draft specification, the kernel does not * support it and the K210 support enabled only for the !MMU case. * Be consistent with this by setting the CPUs MMU type to "none". @@ -30,14 +35,14 @@ cpus { timebase-frequency = <7800000>; cpu0: cpu@0 { device_type = "cpu"; + compatible = "canaan,k210", "riscv"; reg = <0>; - compatible = "kendryte,k210", "sifive,rocket0", "riscv"; riscv,isa = "rv64imafdc"; - mmu-type = "none"; - i-cache-size = <0x8000>; + mmu-type = "riscv,none"; i-cache-block-size = <64>; - d-cache-size = <0x8000>; + i-cache-size = <0x8000>; d-cache-block-size = <64>; + d-cache-size = <0x8000>; cpu0_intc: interrupt-controller { #interrupt-cells = <1>; interrupt-controller; @@ -46,14 +51,14 @@ cpu0_intc: interrupt-controller { }; cpu1: cpu@1 { device_type = "cpu"; + compatible = "canaan,k210", "riscv"; reg = <1>; - compatible = "kendryte,k210", "sifive,rocket0", "riscv"; riscv,isa = "rv64imafdc"; - mmu-type = "none"; - i-cache-size = <0x8000>; + mmu-type = "riscv,none"; i-cache-block-size = <64>; - d-cache-size = <0x8000>; + i-cache-size = <0x8000>; d-cache-block-size = <64>; + d-cache-size = <0x8000>; cpu1_intc: interrupt-controller { #interrupt-cells = <1>; interrupt-controller; @@ -64,10 +69,15 @@ cpu1_intc: interrupt-controller { sram: memory@80000000 { device_type = "memory"; + compatible = "canaan,k210-sram"; reg = <0x80000000 0x400000>, <0x80400000 0x200000>, <0x80600000 0x200000>; reg-names = "sram0", "sram1", "aisram"; + clocks = <&sysclk K210_CLK_SRAM0>, + <&sysclk K210_CLK_SRAM1>, + <&sysclk K210_CLK_AI>; + clock-names = "sram0", "sram1", "aisram"; }; clocks { @@ -81,40 +91,369 @@ in0: oscillator { soc { #address-cells = <1>; #size-cells = <1>; - compatible = "kendryte,k210-soc", "simple-bus"; + compatible = "simple-bus"; ranges; interrupt-parent = <&plic0>; - sysctl: sysctl@50440000 { - compatible = "kendryte,k210-sysctl", "simple-mfd"; - reg = <0x50440000 0x1000>; - #clock-cells = <1>; + rom0: nvmem@1000 { + reg = <0x1000 0x1000>; + read-only; }; - clint0: clint@2000000 { - #interrupt-cells = <1>; - compatible = "riscv,clint0"; + clint0: timer@2000000 { + compatible = "canaan,k210-clint", "sifive,clint0"; reg = <0x2000000 0xC000>; - interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 - &cpu1_intc 3 &cpu1_intc 7>; + interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 + &cpu1_intc 3 &cpu1_intc 7>; }; plic0: interrupt-controller@c000000 { #interrupt-cells = <1>; - interrupt-controller; - compatible = "kendryte,k210-plic0", "riscv,plic0"; + #address-cells = <0>; + compatible = "canaan,k210-plic", "sifive,plic-1.0.0"; reg = <0xC000000 0x4000000>; - interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 0xffffffff>, - <&cpu1_intc 11>, <&cpu1_intc 0xffffffff>; + interrupt-controller; + interrupts-extended = <&cpu0_intc 11 &cpu1_intc 11>; riscv,ndev = <65>; - riscv,max-priority = <7>; }; uarths0: serial@38000000 { - compatible = "kendryte,k210-uarths", "sifive,uart0"; + compatible = "canaan,k210-uarths", "sifive,uart0"; reg = <0x38000000 0x1000>; interrupts = <33>; - clocks = <&sysctl K210_CLK_CPU>; + clocks = <&sysclk K210_CLK_CPU>; + }; + + gpio0: gpio-controller@38001000 { + #interrupt-cells = <2>; + #gpio-cells = <2>; + compatible = "canaan,k210-gpiohs", "sifive,gpio0"; + reg = <0x38001000 0x1000>; + interrupt-controller; + interrupts = <34 35 36 37 38 39 40 41 + 42 43 44 45 46 47 48 49 + 50 51 52 53 54 55 56 57 + 58 59 60 61 62 63 64 65>; + gpio-controller; + ngpios = <32>; + }; + + dmac0: dma-controller@50000000 { + compatible = "snps,axi-dma-1.01a"; + reg = <0x50000000 0x1000>; + interrupts = <27 28 29 30 31 32>; + #dma-cells = <1>; + clocks = <&sysclk K210_CLK_DMA>, <&sysclk K210_CLK_DMA>; + clock-names = "core-clk", "cfgr-clk"; + resets = <&sysrst K210_RST_DMA>; + dma-channels = <6>; + snps,dma-masters = <2>; + snps,priority = <0 1 2 3 4 5>; + snps,data-width = <5>; + snps,block-size = <0x200000 0x200000 0x200000 + 0x200000 0x200000 0x200000>; + snps,axi-max-burst-len = <256>; + }; + + apb0: bus@50200000 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "simple-pm-bus"; + ranges; + clocks = <&sysclk K210_CLK_APB0>; + + gpio1: gpio@50200000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dw-apb-gpio"; + reg = <0x50200000 0x80>; + clocks = <&sysclk K210_CLK_APB0>, + <&sysclk K210_CLK_GPIO>; + clock-names = "bus", "db"; + resets = <&sysrst K210_RST_GPIO>; + + gpio1_0: gpio-port@0 { + #gpio-cells = <2>; + #interrupt-cells = <2>; + compatible = "snps,dw-apb-gpio-port"; + reg = <0>; + interrupt-controller; + interrupts = <23>; + gpio-controller; + ngpios = <8>; + }; + }; + + uart1: serial@50210000 { + compatible = "snps,dw-apb-uart"; + reg = <0x50210000 0x100>; + interrupts = <11>; + clocks = <&sysclk K210_CLK_UART1>, + <&sysclk K210_CLK_APB0>; + clock-names = "baudclk", "apb_pclk"; + resets = <&sysrst K210_RST_UART1>; + reg-io-width = <4>; + reg-shift = <2>; + dcd-override; + dsr-override; + cts-override; + ri-override; + }; + + uart2: serial@50220000 { + compatible = "snps,dw-apb-uart"; + reg = <0x50220000 0x100>; + interrupts = <12>; + clocks = <&sysclk K210_CLK_UART2>, + <&sysclk K210_CLK_APB0>; + clock-names = "baudclk", "apb_pclk"; + resets = <&sysrst K210_RST_UART2>; + reg-io-width = <4>; + reg-shift = <2>; + dcd-override; + dsr-override; + cts-override; + ri-override; + }; + + uart3: serial@50230000 { + compatible = "snps,dw-apb-uart"; + reg = <0x50230000 0x100>; + interrupts = <13>; + clocks = <&sysclk K210_CLK_UART3>, + <&sysclk K210_CLK_APB0>; + clock-names = "baudclk", "apb_pclk"; + resets = <&sysrst K210_RST_UART3>; + reg-io-width = <4>; + reg-shift = <2>; + dcd-override; + dsr-override; + cts-override; + ri-override; + }; + + spi2: spi@50240000 { + compatible = "canaan,k210-spi"; + spi-slave; + reg = <0x50240000 0x100>; + #address-cells = <0>; + #size-cells = <0>; + interrupts = <3>; + clocks = <&sysclk K210_CLK_SPI2>, + <&sysclk K210_CLK_APB0>; + clock-names = "ssi_clk", "pclk"; + resets = <&sysrst K210_RST_SPI2>; + spi-max-frequency = <25000000>; + }; + + i2s0: i2s@50250000 { + compatible = "snps,designware-i2s"; + reg = <0x50250000 0x200>; + interrupts = <5>; + clocks = <&sysclk K210_CLK_I2S0>; + clock-names = "i2sclk"; + resets = <&sysrst K210_RST_I2S0>; + }; + + i2s1: i2s@50260000 { + compatible = "snps,designware-i2s"; + reg = <0x50260000 0x200>; + interrupts = <6>; + clocks = <&sysclk K210_CLK_I2S1>; + clock-names = "i2sclk"; + resets = <&sysrst K210_RST_I2S1>; + }; + + i2s2: i2s@50270000 { + compatible = "snps,designware-i2s"; + reg = <0x50270000 0x200>; + interrupts = <7>; + clocks = <&sysclk K210_CLK_I2S2>; + clock-names = "i2sclk"; + resets = <&sysrst K210_RST_I2S2>; + }; + + i2c0: i2c@50280000 { + compatible = "snps,designware-i2c"; + reg = <0x50280000 0x100>; + interrupts = <8>; + clocks = <&sysclk K210_CLK_I2C0>, + <&sysclk K210_CLK_APB0>; + clock-names = "ref", "pclk"; + resets = <&sysrst K210_RST_I2C0>; + }; + + i2c1: i2c@50290000 { + compatible = "snps,designware-i2c"; + reg = <0x50290000 0x100>; + interrupts = <9>; + clocks = <&sysclk K210_CLK_I2C1>, + <&sysclk K210_CLK_APB0>; + clock-names = "ref", "pclk"; + resets = <&sysrst K210_RST_I2C1>; + }; + + i2c2: i2c@502a0000 { + compatible = "snps,designware-i2c"; + reg = <0x502A0000 0x100>; + interrupts = <10>; + clocks = <&sysclk K210_CLK_I2C2>, + <&sysclk K210_CLK_APB0>; + clock-names = "ref", "pclk"; + resets = <&sysrst K210_RST_I2C2>; + }; + + fpioa: pinmux@502b0000 { + compatible = "canaan,k210-fpioa"; + reg = <0x502B0000 0x100>; + clocks = <&sysclk K210_CLK_FPIOA>, + <&sysclk K210_CLK_APB0>; + clock-names = "ref", "pclk"; + resets = <&sysrst K210_RST_FPIOA>; + canaan,k210-sysctl-power = <&sysctl 108>; + }; + + timer0: timer@502d0000 { + compatible = "snps,dw-apb-timer"; + reg = <0x502D0000 0x100>; + interrupts = <14 15>; + clocks = <&sysclk K210_CLK_TIMER0>, + <&sysclk K210_CLK_APB0>; + clock-names = "timer", "pclk"; + resets = <&sysrst K210_RST_TIMER0>; + }; + + timer1: timer@502e0000 { + compatible = "snps,dw-apb-timer"; + reg = <0x502E0000 0x100>; + interrupts = <16 17>; + clocks = <&sysclk K210_CLK_TIMER1>, + <&sysclk K210_CLK_APB0>; + clock-names = "timer", "pclk"; + resets = <&sysrst K210_RST_TIMER1>; + }; + + timer2: timer@502f0000 { + compatible = "snps,dw-apb-timer"; + reg = <0x502F0000 0x100>; + interrupts = <18 19>; + clocks = <&sysclk K210_CLK_TIMER2>, + <&sysclk K210_CLK_APB0>; + clock-names = "timer", "pclk"; + resets = <&sysrst K210_RST_TIMER2>; + }; + }; + + apb1: bus@50400000 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "simple-pm-bus"; + ranges; + clocks = <&sysclk K210_CLK_APB1>; + + wdt0: watchdog@50400000 { + compatible = "snps,dw-wdt"; + reg = <0x50400000 0x100>; + interrupts = <21>; + clocks = <&sysclk K210_CLK_WDT0>, + <&sysclk K210_CLK_APB1>; + clock-names = "tclk", "pclk"; + resets = <&sysrst K210_RST_WDT0>; + }; + + wdt1: watchdog@50410000 { + compatible = "snps,dw-wdt"; + reg = <0x50410000 0x100>; + interrupts = <22>; + clocks = <&sysclk K210_CLK_WDT1>, + <&sysclk K210_CLK_APB1>; + clock-names = "tclk", "pclk"; + resets = <&sysrst K210_RST_WDT1>; + }; + + sysctl: syscon@50440000 { + compatible = "canaan,k210-sysctl", + "syscon", "simple-mfd"; + reg = <0x50440000 0x100>; + clocks = <&sysclk K210_CLK_APB1>; + clock-names = "pclk"; + + sysclk: clock-controller { + #clock-cells = <1>; + compatible = "canaan,k210-clk"; + clocks = <&in0>; + }; + + sysrst: reset-controller { + compatible = "canaan,k210-rst"; + #reset-cells = <1>; + }; + + reboot: syscon-reboot { + compatible = "syscon-reboot"; + regmap = <&sysctl>; + offset = <48>; + mask = <1>; + value = <1>; + }; + }; + }; + + apb2: bus@52000000 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "simple-pm-bus"; + ranges; + clocks = <&sysclk K210_CLK_APB2>; + + spi0: spi@52000000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "canaan,k210-spi"; + reg = <0x52000000 0x100>; + interrupts = <1>; + clocks = <&sysclk K210_CLK_SPI0>, + <&sysclk K210_CLK_APB2>; + clock-names = "ssi_clk", "pclk"; + resets = <&sysrst K210_RST_SPI0>; + reset-names = "spi"; + spi-max-frequency = <25000000>; + num-cs = <4>; + reg-io-width = <4>; + }; + + spi1: spi@53000000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "canaan,k210-spi"; + reg = <0x53000000 0x100>; + interrupts = <2>; + clocks = <&sysclk K210_CLK_SPI1>, + <&sysclk K210_CLK_APB2>; + clock-names = "ssi_clk", "pclk"; + resets = <&sysrst K210_RST_SPI1>; + reset-names = "spi"; + spi-max-frequency = <25000000>; + num-cs = <4>; + reg-io-width = <4>; + }; + + spi3: spi@54000000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwc-ssi-1.01a"; + reg = <0x54000000 0x200>; + interrupts = <4>; + clocks = <&sysclk K210_CLK_SPI3>, + <&sysclk K210_CLK_APB2>; + clock-names = "ssi_clk", "pclk"; + resets = <&sysrst K210_RST_SPI3>; + reset-names = "spi"; + /* Could possibly go up to 200 MHz */ + spi-max-frequency = <100000000>; + num-cs = <4>; + reg-io-width = <4>; + }; }; }; }; diff --git a/arch/riscv/boot/dts/canaan/k210_generic.dts b/arch/riscv/boot/dts/canaan/k210_generic.dts new file mode 100644 index 000000000000..396c8ca4d24d --- /dev/null +++ b/arch/riscv/boot/dts/canaan/k210_generic.dts @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +/dts-v1/; + +#include "k210.dtsi" + +#include +#include + +/ { + model = "Kendryte K210 generic"; + compatible = "canaan,kendryte-k210"; + + chosen { + bootargs = "earlycon console=ttySIF0"; + stdout-path = "serial0:115200n8"; + }; +}; + +&fpioa { + pinctrl-0 = <&jtag_pins>; + pinctrl-names = "default"; + status = "okay"; + + jtag_pins: jtag-pinmux { + pinmux = , + , + , + ; + }; + + uarths_pins: uarths-pinmux { + pinmux = , + ; + }; +}; + +&uarths0 { + pinctrl-0 = <&uarths_pins>; + pinctrl-names = "default"; + status = "okay"; +}; diff --git a/include/dt-bindings/clock/k210-clk.h b/include/dt-bindings/clock/k210-clk.h index a48176ad3c23..b2de702cbf75 100644 --- a/include/dt-bindings/clock/k210-clk.h +++ b/include/dt-bindings/clock/k210-clk.h @@ -9,7 +9,6 @@ /* * Kendryte K210 SoC clock identifiers (arbitrary values). */ -#define K210_CLK_ACLK 0 #define K210_CLK_CPU 0 #define K210_CLK_SRAM0 1 #define K210_CLK_SRAM1 2 From 97c279bcf813caa5f4a7aa2636c1be77a9e29afc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:24 +0900 Subject: [PATCH 254/506] riscv: Add SiPeed MAIX BiT board device tree Add the device tree sipeed_maix_bit.dts for the SiPeed MAIX BiT and MAIX BiTm boards. This device tree enables LEDs, gpio, i2c and spi/mmc SD card devices. Signed-off-by: Damien Le Moal [Palmer: Remove undocumented microphone entry, along with the use.] Signed-off-by: Palmer Dabbelt --- .../riscv/boot/dts/canaan/sipeed_maix_bit.dts | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts new file mode 100644 index 000000000000..0bcaf35045e7 --- /dev/null +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +/dts-v1/; + +#include "k210.dtsi" + +#include +#include +#include + +/ { + model = "SiPeed MAIX BiT"; + compatible = "sipeed,maix-bit", "sipeed,maix-bitm", + "canaan,kendryte-k210"; + + chosen { + bootargs = "earlycon console=ttySIF0"; + stdout-path = "serial0:115200n8"; + }; + + gpio-leds { + compatible = "gpio-leds"; + + led0 { + color = ; + label = "green"; + gpios = <&gpio1_0 4 GPIO_ACTIVE_LOW>; + }; + + led1 { + color = ; + label = "red"; + gpios = <&gpio1_0 5 GPIO_ACTIVE_LOW>; + }; + + led2 { + color = ; + label = "blue"; + gpios = <&gpio1_0 6 GPIO_ACTIVE_LOW>; + }; + }; + + gpio-keys { + compatible = "gpio-keys"; + + boot { + label = "BOOT"; + linux,code = ; + gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; + }; + }; +}; + +&fpioa { + pinctrl-names = "default"; + pinctrl-0 = <&jtag_pinctrl>; + status = "okay"; + + jtag_pinctrl: jtag-pinmux { + pinmux = , + , + , + ; + }; + + uarths_pinctrl: uarths-pinmux { + pinmux = , + ; + }; + + gpio_pinctrl: gpio-pinmux { + pinmux = , + , + , + , + , + , + , + ; + }; + + gpiohs_pinctrl: gpiohs-pinmux { + pinmux = , + , + , + , + , + , + , + , + , + , + ; + }; + + i2s0_pinctrl: i2s0-pinmux { + pinmux = , + , + ; + }; + + dvp_pinctrl: dvp-pinmux { + pinmux = , + , + , + , + , + , + , + ; + }; + + spi0_pinctrl: spi0-pinmux { + pinmux = , /* cs */ + , /* rst */ + , /* dc */ + ; /* wr */ + }; + + spi1_pinctrl: spi1-pinmux { + pinmux = , + , + , + ; /* cs */ + }; + + i2c1_pinctrl: i2c1-pinmux { + pinmux = , + ; + }; +}; + +&uarths0 { + pinctrl-0 = <&uarths_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio0 { + pinctrl-0 = <&gpiohs_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio1 { + pinctrl-0 = <&gpio_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&i2s0 { + #sound-dai-cells = <1>; + pinctrl-0 = <&i2s0_pinctrl>; + pinctrl-names = "default"; +}; + +&i2c1 { + pinctrl-0 = <&i2c1_pinctrl>; + pinctrl-names = "default"; + clock-frequency = <400000>; + status = "okay"; +}; + +&spi0 { + pinctrl-0 = <&spi0_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + + panel@0 { + compatible = "sitronix,st7789v"; + reg = <0>; + reset-gpios = <&gpio0 21 GPIO_ACTIVE_LOW>; + dc-gpios = <&gpio0 22 GPIO_ACTIVE_HIGH>; + spi-max-frequency = <15000000>; + spi-cs-high; + status = "disabled"; + }; +}; + +&spi1 { + pinctrl-0 = <&spi1_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 13 GPIO_ACTIVE_LOW>; + status = "okay"; + + slot@0 { + compatible = "mmc-spi-slot"; + reg = <0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <25000000>; + broken-cd; + }; +}; + +&spi3 { + spi-flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <50000000>; + m25p,fast-read; + broken-flash-reset; + }; +}; From a40f920964c4edef3885cd7fe944033687039f69 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:25 +0900 Subject: [PATCH 255/506] riscv: Add SiPeed MAIX DOCK board device tree Add the device tree sipeed_maix_dock.dts for the SiPeed MAIX DOCK m1 and m1w boards. This device tree enables LEDs, gpio, i2c and spi/mmc SD card devices. Signed-off-by: Damien Le Moal [Palmer: Remove undocumented microphone entry, along with the use.] Signed-off-by: Palmer Dabbelt --- .../boot/dts/canaan/sipeed_maix_dock.dts | 211 ++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts new file mode 100644 index 000000000000..ac8a03f5867a --- /dev/null +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +/dts-v1/; + +#include "k210.dtsi" + +#include +#include +#include + +/ { + model = "SiPeed MAIX Dock"; + compatible = "sipeed,maix-dock-m1", "sipeed,maix-dock-m1w", + "canaan,kendryte-k210"; + + chosen { + bootargs = "earlycon console=ttySIF0"; + stdout-path = "serial0:115200n8"; + }; + + gpio-leds { + compatible = "gpio-leds"; + + /* + * Note: the board wiring drawing documents green on + * gpio #4, red on gpio #5 and blue on gpio #6. However, + * the board is actually wired differently as defined here. + */ + led0 { + color = ; + label = "blue"; + gpios = <&gpio1_0 4 GPIO_ACTIVE_LOW>; + }; + + led1 { + color = ; + label = "green"; + gpios = <&gpio1_0 5 GPIO_ACTIVE_LOW>; + }; + + led2 { + color = ; + label = "red"; + gpios = <&gpio1_0 6 GPIO_ACTIVE_LOW>; + }; + }; + + gpio-keys { + compatible = "gpio-keys"; + + boot { + label = "BOOT"; + linux,code = ; + gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; + }; + }; +}; + +&fpioa { + pinctrl-0 = <&jtag_pinctrl>; + pinctrl-names = "default"; + status = "okay"; + + jtag_pinctrl: jtag-pinmux { + pinmux = , + , + , + ; + }; + + uarths_pinctrl: uarths-pinmux { + pinmux = , + ; + }; + + gpio_pinctrl: gpio-pinmux { + pinmux = , + , + , + , + , + ; + }; + + gpiohs_pinctrl: gpiohs-pinmux { + pinmux = , + , + , + , + , + , + , + , + , + , + ; + }; + + i2s0_pinctrl: i2s0-pinmux { + pinmux = , + , + ; + }; + + dvp_pinctrl: dvp-pinmux { + pinmux = , + , + , + , + , + , + , + ; + }; + + spi0_pinctrl: spi0-pinmux { + pinmux = , /* cs */ + , /* rst */ + , /* dc */ + ; /* wr */ + }; + + spi1_pinctrl: spi1-pinmux { + pinmux = , + , + , + ; /* cs */ + }; + + i2c1_pinctrl: i2c1-pinmux { + pinmux = , + ; + }; +}; + +&uarths0 { + pinctrl-0 = <&uarths_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio0 { + pinctrl-0 = <&gpiohs_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio1 { + pinctrl-0 = <&gpio_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&i2s0 { + #sound-dai-cells = <1>; + pinctrl-0 = <&i2s0_pinctrl>; + pinctrl-names = "default"; +}; + +&i2c1 { + pinctrl-0 = <&i2c1_pinctrl>; + pinctrl-names = "default"; + clock-frequency = <400000>; + status = "okay"; +}; + +&spi0 { + pinctrl-0 = <&spi0_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + + panel@0 { + compatible = "sitronix,st7789v"; + reg = <0>; + reset-gpios = <&gpio0 21 GPIO_ACTIVE_LOW>; + dc-gpios = <&gpio0 22 0>; + spi-max-frequency = <15000000>; + status = "disabled"; + }; +}; + +&spi1 { + pinctrl-0 = <&spi1_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 13 GPIO_ACTIVE_LOW>; + status = "okay"; + + slot@0 { + compatible = "mmc-spi-slot"; + reg = <0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <25000000>; + broken-cd; + }; +}; + +&spi3 { + spi-flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <50000000>; + m25p,fast-read; + broken-flash-reset; + }; +}; From 8194f08bda18329d527abe0d767b031a108b7121 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:26 +0900 Subject: [PATCH 256/506] riscv: Add SiPeed MAIX GO board device tree Add the device tree sipeed_maix_go.dts for the SiPeed MAIX GO board. This device tree enables buttons, LEDs, gpio, i2c and spi/mmc SD card devices. Signed-off-by: Damien Le Moal [Palmer: Remove undocumented microphone entry, along with the use.] Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/canaan/sipeed_maix_go.dts | 219 ++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 arch/riscv/boot/dts/canaan/sipeed_maix_go.dts diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts new file mode 100644 index 000000000000..623998194bc1 --- /dev/null +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +/dts-v1/; + +#include "k210.dtsi" + +#include +#include +#include + +/ { + model = "SiPeed MAIX GO"; + compatible = "sipeed,maix-go", "canaan,kendryte-k210"; + + chosen { + bootargs = "earlycon console=ttySIF0"; + stdout-path = "serial0:115200n8"; + }; + + gpio-leds { + compatible = "gpio-leds"; + + led0 { + color = ; + label = "green"; + gpios = <&gpio1_0 4 GPIO_ACTIVE_LOW>; + }; + + led1 { + color = ; + label = "red"; + gpios = <&gpio1_0 5 GPIO_ACTIVE_LOW>; + }; + + led2 { + color = ; + label = "blue"; + gpios = <&gpio1_0 6 GPIO_ACTIVE_LOW>; + }; + }; + + gpio-keys { + compatible = "gpio-keys"; + + up { + label = "UP"; + linux,code = ; + gpios = <&gpio1_0 7 GPIO_ACTIVE_LOW>; + }; + + press { + label = "PRESS"; + linux,code = ; + gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; + }; + + down { + label = "DOWN"; + linux,code = ; + gpios = <&gpio0 1 GPIO_ACTIVE_LOW>; + }; + }; +}; + +&fpioa { + pinctrl-0 = <&jtag_pinctrl>; + pinctrl-names = "default"; + status = "okay"; + + jtag_pinctrl: jtag-pinmux { + pinmux = , + , + , + ; + }; + + uarths_pinctrl: uarths-pinmux { + pinmux = , + ; + }; + + gpio_pinctrl: gpio-pinmux { + pinmux = , + , + , + , + , + , + , + ; + }; + + gpiohs_pinctrl: gpiohs-pinmux { + pinmux = , + , + , + , + , + , + , + , + , + , + ; + }; + + i2s0_pinctrl: i2s0-pinmux { + pinmux = , + , + ; + }; + + dvp_pinctrl: dvp-pinmux { + pinmux = , + , + , + , + , + , + , + ; + }; + + spi0_pinctrl: spi0-pinmux { + pinmux = , /* cs */ + , /* rst */ + , /* dc */ + ; /* wr */ + }; + + spi1_pinctrl: spi1-pinmux { + pinmux = , + , + , + ; /* cs */ + }; + + i2c1_pinctrl: i2c1-pinmux { + pinmux = , + ; + }; +}; + +&uarths0 { + pinctrl-0 = <&uarths_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio0 { + pinctrl-0 = <&gpiohs_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio1 { + pinctrl-0 = <&gpio_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&i2s0 { + #sound-dai-cells = <1>; + pinctrl-0 = <&i2s0_pinctrl>; + pinctrl-names = "default"; +}; + +&i2c1 { + pinctrl-0 = <&i2c1_pinctrl>; + pinctrl-names = "default"; + clock-frequency = <400000>; + status = "okay"; +}; + +&spi0 { + pinctrl-0 = <&spi0_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + + panel@0 { + compatible = "sitronix,st7789v"; + reg = <0>; + reset-gpios = <&gpio0 21 GPIO_ACTIVE_LOW>; + dc-gpios = <&gpio0 22 GPIO_ACTIVE_HIGH>; + spi-max-frequency = <15000000>; + status = "disabled"; + }; +}; + +&spi1 { + pinctrl-0 = <&spi1_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 13 GPIO_ACTIVE_LOW>; + status = "okay"; + + slot@0 { + compatible = "mmc-spi-slot"; + reg = <0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <25000000>; + broken-cd; + }; +}; + +&spi3 { + spi-flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <50000000>; + m25p,fast-read; + broken-flash-reset; + }; +}; From 8f5b0e79f3e5cd3e76022bf6451c17cd6509ddfe Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:27 +0900 Subject: [PATCH 257/506] riscv: Add SiPeed MAIXDUINO board device tree Add the device tree sipeed_maixduino.dts for the SiPeed MAIXDUINO board. This device tree enables LEDs and spi/mmc SD card device. Additionally, gpios and i2c are also enabled and mapped to the board header pins as indicated on the board itself. Signed-off-by: Damien Le Moal [Palmer: Remove undocumented microphone entry, along with the use.] Signed-off-by: Palmer Dabbelt --- .../boot/dts/canaan/sipeed_maixduino.dts | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 arch/riscv/boot/dts/canaan/sipeed_maixduino.dts diff --git a/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts new file mode 100644 index 000000000000..cf605ba0d67e --- /dev/null +++ b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +/dts-v1/; + +#include "k210.dtsi" + +#include +#include + +/ { + model = "SiPeed MAIXDUINO"; + compatible = "sipeed,maixduino", "canaan,kendryte-k210"; + + chosen { + bootargs = "earlycon console=ttySIF0"; + stdout-path = "serial0:115200n8"; + }; + + gpio-keys { + compatible = "gpio-keys"; + + boot { + label = "BOOT"; + linux,code = ; + gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; + }; + }; + + vcc_3v3: regulator-3v3 { + compatible = "regulator-fixed"; + regulator-name = "3v3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; +}; + +&fpioa { + status = "okay"; + + uarths_pinctrl: uarths-pinmux { + pinmux = , /* Header "0" */ + ; /* Header "1" */ + }; + + gpio_pinctrl: gpio-pinmux { + pinmux = , + ; + }; + + gpiohs_pinctrl: gpiohs-pinmux { + pinmux = , /* BOOT */ + , /* Header "2" */ + , /* Header "3" */ + , /* Header "4" */ + , /* Header "5" */ + , /* Header "6" */ + , /* Header "7" */ + , /* Header "8" */ + , /* Header "9" */ + , /* Header "10" */ + , /* Header "11" */ + , /* Header "12" */ + ; /* Header "13" */ + }; + + i2s0_pinctrl: i2s0-pinmux { + pinmux = , + , + ; + }; + + spi1_pinctrl: spi1-pinmux { + pinmux = , + , + , + ; /* cs */ + }; + + i2c1_pinctrl: i2c1-pinmux { + pinmux = , /* Header "scl" */ + ; /* Header "sda" */ + }; + + i2s1_pinctrl: i2s1-pinmux { + pinmux = , + , + ; + }; + + spi0_pinctrl: spi0-pinmux { + pinmux = , /* cs */ + , /* rst */ + , /* dc */ + ; /* wr */ + }; + + dvp_pinctrl: dvp-pinmux { + pinmux = , + , + , + , + , + , + , + ; + }; +}; + +&uarths0 { + pinctrl-0 = <&uarths_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio0 { + pinctrl-0 = <&gpiohs_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio1 { + pinctrl-0 = <&gpio_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&i2s0 { + #sound-dai-cells = <1>; + pinctrl-0 = <&i2s0_pinctrl>; + pinctrl-names = "default"; +}; + +&i2c1 { + pinctrl-0 = <&i2c1_pinctrl>; + pinctrl-names = "default"; + clock-frequency = <400000>; + status = "okay"; +}; + +&spi0 { + pinctrl-0 = <&spi0_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + + panel@0 { + compatible = "sitronix,st7789v"; + reg = <0>; + reset-gpios = <&gpio0 21 GPIO_ACTIVE_LOW>; + dc-gpios = <&gpio0 22 0>; + spi-max-frequency = <15000000>; + power-supply = <&vcc_3v3>; + }; +}; + +&spi1 { + pinctrl-0 = <&spi1_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio1_0 2 GPIO_ACTIVE_LOW>; + status = "okay"; + + slot@0 { + compatible = "mmc-spi-slot"; + reg = <0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <25000000>; + broken-cd; + }; +}; + +&spi3 { + spi-flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <50000000>; + m25p,fast-read; + broken-flash-reset; + }; +}; From 62363a8e2f56e1797a95e01dd592927aed480035 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:28 +0900 Subject: [PATCH 258/506] riscv: Add Kendryte KD233 board device tree Add the device tree canaan_kd233.dts for the Canaan Kendryte KD233 development board. This device tree enables LEDs, some gpios and spi/mmc SD card device. The WS2812B RGB LED and the 10 positions rotary dip switch present on the board are left undefined. Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel [Palmer: Remove undocumented microphone entry, along with the use.] Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/canaan/canaan_kd233.dts | 152 ++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 arch/riscv/boot/dts/canaan/canaan_kd233.dts diff --git a/arch/riscv/boot/dts/canaan/canaan_kd233.dts b/arch/riscv/boot/dts/canaan/canaan_kd233.dts new file mode 100644 index 000000000000..039b92abf046 --- /dev/null +++ b/arch/riscv/boot/dts/canaan/canaan_kd233.dts @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +/dts-v1/; + +#include "k210.dtsi" + +#include +#include + +/ { + model = "Kendryte KD233"; + compatible = "canaan,kendryte-kd233", "canaan,kendryte-k210"; + + chosen { + bootargs = "earlycon console=ttySIF0"; + stdout-path = "serial0:115200n8"; + }; + + gpio-leds { + compatible = "gpio-leds"; + + led0 { + gpios = <&gpio0 8 GPIO_ACTIVE_LOW>; + }; + + led1 { + gpios = <&gpio0 9 GPIO_ACTIVE_LOW>; + }; + }; + + gpio-keys { + compatible = "gpio-keys"; + + key0 { + label = "KEY0"; + linux,code = ; + gpios = <&gpio0 10 GPIO_ACTIVE_LOW>; + }; + }; +}; + +&fpioa { + pinctrl-0 = <&jtag_pinctrl>; + pinctrl-names = "default"; + status = "okay"; + + jtag_pinctrl: jtag-pinmux { + pinmux = , + , + , + ; + }; + + uarths_pinctrl: uarths-pinmux { + pinmux = , + ; + }; + + spi0_pinctrl: spi0-pinmux { + pinmux = , /* cs */ + , /* wr */ + ; /* dc */ + }; + + dvp_pinctrl: dvp-pinmux { + pinmux = , + , + , + , + , + , + , + ; + }; + + gpiohs_pinctrl: gpiohs-pinmux { + pinmux = , + , /* Rot. dip sw line 8 */ + , /* Rot. dip sw line 4 */ + , /* Rot. dip sw line 2 */ + , /* Rot. dip sw line 1 */ + , + , + ; + }; + + spi1_pinctrl: spi1-pinmux { + pinmux = , + , + , + ; /* cs */ + }; + + i2s0_pinctrl: i2s0-pinmux { + pinmux = , + , + ; + }; +}; + +&uarths0 { + pinctrl-0 = <&uarths_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&gpio0 { + pinctrl-0 = <&gpiohs_pinctrl>; + pinctrl-names = "default"; + status = "okay"; +}; + +&i2s0 { + #sound-dai-cells = <1>; + pinctrl-0 = <&i2s0_pinctrl>; + pinctrl-names = "default"; +}; + +&spi0 { + pinctrl-0 = <&spi0_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + + panel@0 { + compatible = "ilitek,ili9341"; + reg = <0>; + dc-gpios = <&gpio0 21 GPIO_ACTIVE_HIGH>; + spi-max-frequency = <15000000>; + status = "disabled"; + }; +}; + +&spi1 { + pinctrl-0 = <&spi1_pinctrl>; + pinctrl-names = "default"; + num-cs = <1>; + cs-gpios = <&gpio0 16 GPIO_ACTIVE_LOW>; + status = "okay"; + + slot@0 { + compatible = "mmc-spi-slot"; + reg = <0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <25000000>; + broken-cd; + }; +}; From aec3a94d951fc82c209c36e89dda5b5fdea0f4c5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:29 +0900 Subject: [PATCH 259/506] riscv: Update Canaan Kendryte K210 defconfig Update the Kendryte k210 nommu default configuration file (nommu_k210_defconfig) to include device drivers for reset, reboot, I2C, SPI, gpio and LEDs support. Virtual Terminal support is also disabled as no terminal devices are supported and enabled. Disabling CONFIG_VT (removing the no longer needed override for CONFIG_VGA_CONSOLE) reduces the kernel image size by about 65 KB. This default configuration remains suitable for a system using an initramfs cpio file linked into the kernel image. Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/nommu_k210_defconfig | 44 ++++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index 368a28cf1467..b16a2a12c82a 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -1,17 +1,19 @@ # CONFIG_CPU_ISOLATION is not set -CONFIG_LOG_BUF_SHIFT=15 +CONFIG_LOG_BUF_SHIFT=13 CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_FORCE=y +# CONFIG_RD_GZIP is not set # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set # CONFIG_RD_XZ is not set # CONFIG_RD_LZO is not set # CONFIG_RD_LZ4 is not set +# CONFIG_RD_ZSTD is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_SYSFS_SYSCALL is not set # CONFIG_FHANDLE is not set # CONFIG_BASE_FULL is not set +# CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set # CONFIG_TIMERFD is not set @@ -25,15 +27,17 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLOB=y -# CONFIG_SLAB_MERGE_DEFAULT is not set # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y +CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_CMDLINE="earlycon console=ttySIF0" CONFIG_CMDLINE_FORCE=y -CONFIG_JUMP_LABEL=y +# CONFIG_SECCOMP is not set +# CONFIG_STACKPROTECTOR is not set +# CONFIG_GCC_PLUGINS is not set # CONFIG_BLOCK is not set CONFIG_BINFMT_FLAT=y # CONFIG_COREDUMP is not set @@ -41,23 +45,47 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_FW_LOADER is not set # CONFIG_ALLOW_DEV_COREDUMP is not set -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set +# CONFIG_INPUT is not set # CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_UNIX98_PTYS is not set # CONFIG_LEGACY_PTYS is not set # CONFIG_LDISC_AUTOLOAD is not set # CONFIG_HW_RANDOM is not set # CONFIG_DEVMEM is not set +CONFIG_I2C=y +# CONFIG_I2C_COMPAT is not set +CONFIG_I2C_CHARDEV=y +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_SPI=y +# CONFIG_SPI_MEM is not set +CONFIG_SPI_DESIGNWARE=y +CONFIG_SPI_DW_MMIO=y +# CONFIG_GPIO_CDEV_V1 is not set +CONFIG_GPIO_DWAPB=y +CONFIG_GPIO_SIFIVE=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_SYSCON=y # CONFIG_HWMON is not set -# CONFIG_VGA_CONSOLE is not set -# CONFIG_HID is not set # CONFIG_USB_SUPPORT is not set +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_GPIO=y +CONFIG_LEDS_USER=y # CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set +# CONFIG_SURFACE_PLATFORMS is not set +# CONFIG_FILE_LOCKING is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set # CONFIG_MISC_FILESYSTEMS is not set CONFIG_LSM="[]" CONFIG_PRINTK_TIME=y +# CONFIG_SYMBOLIC_ERRNAME is not set +# CONFIG_DEBUG_BUGVERBOSE is not set +# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set +# CONFIG_FRAME_POINTER is not set # CONFIG_DEBUG_MISC is not set CONFIG_PANIC_ON_OOPS=y # CONFIG_SCHED_DEBUG is not set From 7e09fd3994c5bd00ebd22d7ec207dd05da3bf7fb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Feb 2021 14:02:30 +0900 Subject: [PATCH 260/506] riscv: Add Canaan Kendryte K210 SD card defconfig The nommu_k210_defconfig default configuration allows booting a Canaan Kendryte K210 SoC based boards using an embedded intramfs cpio file. Modifying this configuration to enable support for the board SD card is not trivial for all users. To help beginners getting started with these boards, add the nommu_k210_sdcard_defconfig default configuration file to set all configuration options necessary to use the board mmc-spi sd card for the root file system. This new configuration adds support for the block layer, the mmc-spi driver and modifies the boot options to specify the rootfs device as mmcblk0p1 (first partition of the sd card block device). The ext2 file system is selected by default to encourage its use as that results in only about 4KB added to the kernel image size. As ext2 does not have journaling, the boot options specify a read-only mount of the file system. Similarly to the smaller nommu_k210_defconfig, this new default configuration disables virtual terminal support to reduce the kernel image size. The default device tree selected is unchanged, specifying the simple "k210_generic" device tree file. The user must change this setting to specify the device tree suitable for the board being used (sipeed_maix_bit, sipeed_maix_dock, sipeed_maix_go, sipeed_maixduino or canaan_kd233). Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- .../riscv/configs/nommu_k210_sdcard_defconfig | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 arch/riscv/configs/nommu_k210_sdcard_defconfig diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig new file mode 100644 index 000000000000..61f887f65419 --- /dev/null +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -0,0 +1,92 @@ +# CONFIG_CPU_ISOLATION is not set +CONFIG_LOG_BUF_SHIFT=13 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_FHANDLE is not set +# CONFIG_BASE_FULL is not set +# CONFIG_FUTEX is not set +# CONFIG_EPOLL is not set +# CONFIG_SIGNALFD is not set +# CONFIG_TIMERFD is not set +# CONFIG_EVENTFD is not set +# CONFIG_AIO is not set +# CONFIG_IO_URING is not set +# CONFIG_ADVISE_SYSCALLS is not set +# CONFIG_MEMBARRIER is not set +# CONFIG_KALLSYMS is not set +CONFIG_EMBEDDED=y +# CONFIG_VM_EVENT_COUNTERS is not set +# CONFIG_COMPAT_BRK is not set +CONFIG_SLOB=y +# CONFIG_MMU is not set +CONFIG_SOC_CANAAN=y +CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" +CONFIG_MAXPHYSMEM_2GB=y +CONFIG_SMP=y +CONFIG_NR_CPUS=2 +CONFIG_CMDLINE="earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro" +CONFIG_CMDLINE_FORCE=y +# CONFIG_SECCOMP is not set +# CONFIG_STACKPROTECTOR is not set +# CONFIG_GCC_PLUGINS is not set +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_MQ_IOSCHED_DEADLINE is not set +# CONFIG_MQ_IOSCHED_KYBER is not set +CONFIG_BINFMT_FLAT=y +# CONFIG_COREDUMP is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +# CONFIG_FW_LOADER is not set +# CONFIG_ALLOW_DEV_COREDUMP is not set +# CONFIG_BLK_DEV is not set +# CONFIG_INPUT is not set +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_LDISC_AUTOLOAD is not set +# CONFIG_HW_RANDOM is not set +# CONFIG_DEVMEM is not set +CONFIG_I2C=y +CONFIG_I2C_CHARDEV=y +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_SPI=y +# CONFIG_SPI_MEM is not set +CONFIG_SPI_DESIGNWARE=y +CONFIG_SPI_DW_MMIO=y +# CONFIG_GPIO_CDEV_V1 is not set +CONFIG_GPIO_DWAPB=y +CONFIG_GPIO_SIFIVE=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_SYSCON=y +# CONFIG_HWMON is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_MMC=y +# CONFIG_PWRSEQ_EMMC is not set +# CONFIG_PWRSEQ_SIMPLE is not set +CONFIG_MMC_SPI=y +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_GPIO=y +CONFIG_LEDS_USER=y +# CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set +# CONFIG_SURFACE_PLATFORMS is not set +CONFIG_EXT2_FS=y +# CONFIG_FILE_LOCKING is not set +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY_USER is not set +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_LSM="[]" +CONFIG_PRINTK_TIME=y +# CONFIG_SYMBOLIC_ERRNAME is not set +# CONFIG_DEBUG_BUGVERBOSE is not set +# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set +# CONFIG_FRAME_POINTER is not set +# CONFIG_DEBUG_MISC is not set +CONFIG_PANIC_ON_OOPS=y +# CONFIG_SCHED_DEBUG is not set +# CONFIG_RCU_TRACE is not set +# CONFIG_FTRACE is not set +# CONFIG_RUNTIME_TESTING_MENU is not set From cc937cad14fb219770eb593a3e98b6b0d6fd96fd Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sun, 7 Feb 2021 16:10:53 +0800 Subject: [PATCH 261/506] riscv: Remove unnecessary declaration max_low_pfn and min_low_pfn are declared in linux/memblock.h, and it also is included in arch/riscv/mm/init.c, drop unnecessary declaration. Signed-off-by: Kefeng Wang Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..e49d51b97bc1 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -97,9 +97,6 @@ extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ -extern unsigned long max_low_pfn; -extern unsigned long min_low_pfn; - #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) #define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) From f3d60f2a25e4417e1676161fe42115de3e3f98a2 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 16 Feb 2021 18:33:05 +0100 Subject: [PATCH 262/506] riscv: Disable KSAN_SANITIZE for vDSO We use the generic C VDSO implementations of a handful of clock-related functions. When kasan is enabled this results in asan stub calls that are unlikely to be resolved by userspace, this just disables KASAN when building the VDSO. Verified the fix on a kernel with KASAN enabled using vDSO selftests. Link: https://lore.kernel.org/lkml/CACT4Y+ZNJBnkKHXUf=tm_yuowvZvHwN=0rmJ=7J+xFd+9r_6pQ@mail.gmail.com/ Tested-by: Tobias Klauser Signed-off-by: Tobias Klauser Tested-by: Dmitry Vyukov [Palmer: commit text] Fixes: ad5d1122b82f ("riscv: use vDSO common flow to reduce the latency of the time-related functions") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 0cfd6da784f8..71a315e73cbe 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -32,9 +32,10 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Disable -pg to prevent insert call site CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os -# Disable gcov profiling for VDSO code +# Disable profiling and instrumentation for VDSO code GCOV_PROFILE := n KCOV_INSTRUMENT := n +KASAN_SANITIZE := n # Force dependency $(obj)/vdso.o: $(obj)/vdso.so From 433dfc99aa3e0acbf655b961d98eb690162f758f Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 18 Feb 2021 20:21:06 +0200 Subject: [PATCH 263/506] dpaa_eth: fix the access method for the dpaa_napi_portal The current use of container_of is flawed and unnecessary. Obtain the dpaa_napi_portal reference from the private percpu data instead. Fixes: a1e031ffb422 ("dpaa_eth: add XDP_REDIRECT support") Reported-by: Sascha Hauer Signed-off-by: Camelia Groza Acked-by: Madalin Bucur Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20210218182106.22613-1-camelia.groza@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index ccfe52a50a66..720dc99bd1fc 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2670,7 +2670,6 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal, u32 hash; u64 ns; - np = container_of(&portal, struct dpaa_napi_portal, p); dpaa_fq = container_of(fq, struct dpaa_fq, fq_base); fd_status = be32_to_cpu(fd->status); fd_format = qm_fd_get_format(fd); @@ -2685,6 +2684,7 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal, percpu_priv = this_cpu_ptr(priv->percpu_priv); percpu_stats = &percpu_priv->stats; + np = &percpu_priv->np; if (unlikely(dpaa_eth_napi_schedule(percpu_priv, portal, sched_napi))) return qman_cb_dqrr_stop; From e134d426e1a3b854cb6b62fad818677e58b087d5 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Mon, 22 Feb 2021 18:18:58 -0800 Subject: [PATCH 264/506] soc: canaan: Sort the Makefile alphabetically The rest of these are alphabetically sorted, and leaving it this way causes a merge conflict. Reviewed-by: Damien Le Moal Signed-off-by: Palmer Dabbelt --- drivers/soc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile index fa7071246546..34b23645be14 100644 --- a/drivers/soc/Makefile +++ b/drivers/soc/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_ARCH_ACTIONS) += actions/ obj-y += aspeed/ obj-$(CONFIG_ARCH_AT91) += atmel/ obj-y += bcm/ +obj-$(CONFIG_SOC_CANAAN) += canaan/ obj-$(CONFIG_ARCH_DOVE) += dove/ obj-$(CONFIG_MACH_DOVE) += dove/ obj-y += fsl/ @@ -29,4 +30,3 @@ obj-$(CONFIG_ARCH_U8500) += ux500/ obj-$(CONFIG_PLAT_VERSATILE) += versatile/ obj-y += xilinx/ obj-$(CONFIG_ARCH_ZX) += zte/ -obj-$(CONFIG_SOC_CANAAN) += canaan/ From 0f02de4481da684aad6589aed0ea47bd1ab391c9 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sun, 21 Feb 2021 09:22:33 -0500 Subject: [PATCH 265/506] riscv: Get rid of MAX_EARLY_MAPPING_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At early boot stage, we have a whole PGDIR to map the kernel, so there is no need to restrict the early mapping size to 128MB. Removing this define also allows us to simplify some compile time logic. This fixes large kernel mappings with a size greater than 128MB, as it is the case for syzbot kernels whose size was just ~130MB. Note that on rv64, for now, we are then limited to PGDIR size for early mapping as we can't use PGD mappings (see [1]). That should be enough given the relative small size of syzbot kernels compared to PGDIR_SIZE which is 1GB. [1] https://lore.kernel.org/lkml/20200603153608.30056-1-alex@ghiti.fr/ Reported-by: Dmitry Vyukov Signed-off-by: Alexandre Ghiti Tested-by: Dmitry Vyukov Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c9bbab76358b..3e044f11cef6 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -165,8 +165,6 @@ pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; -#define MAX_EARLY_MAPPING_SIZE SZ_128M - pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) @@ -241,13 +239,7 @@ static void __init create_pte_mapping(pte_t *ptep, pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss; pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; - -#if MAX_EARLY_MAPPING_SIZE < PGDIR_SIZE -#define NUM_EARLY_PMDS 1UL -#else -#define NUM_EARLY_PMDS (1UL + MAX_EARLY_MAPPING_SIZE / PGDIR_SIZE) -#endif -pmd_t early_pmd[PTRS_PER_PMD * NUM_EARLY_PMDS] __initdata __aligned(PAGE_SIZE); +pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) @@ -269,11 +261,9 @@ static pmd_t *get_pmd_virt_late(phys_addr_t pa) static phys_addr_t __init alloc_pmd_early(uintptr_t va) { - uintptr_t pmd_num; + BUG_ON((va - PAGE_OFFSET) >> PGDIR_SHIFT); - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; - BUG_ON(pmd_num >= NUM_EARLY_PMDS); - return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; + return (uintptr_t)early_pmd; } static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) @@ -391,7 +381,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) uintptr_t va, pa, end_va; uintptr_t load_pa = (uintptr_t)(&_start); uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; - uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); + uintptr_t map_size; #ifndef __PAGETABLE_PMD_FOLDED pmd_t fix_bmap_spmd, fix_bmap_epmd; #endif @@ -403,12 +393,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * Enforce boot alignment requirements of RV32 and * RV64 by only allowing PMD or PGD mappings. */ - BUG_ON(map_size == PAGE_SIZE); + map_size = PMD_SIZE; /* Sanity check alignment and size */ BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); BUG_ON((load_pa % map_size) != 0); - BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE); pt_ops.alloc_pte = alloc_pte_early; pt_ops.get_pte_virt = get_pte_virt_early; From 0a8a800027f124845c3ce0b5c3dfed6f268b13bb Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Thu, 18 Feb 2021 14:42:03 +0200 Subject: [PATCH 266/506] net: mvpp2: skip RSS configurations on loopback port PPv2 loopback port doesn't support RSS, so we should skip RSS configurations for this port. Signed-off-by: Stefan Chulski Reviewed-by: Marcin Wojtas Link: https://lore.kernel.org/r/1613652123-19021-1-git-send-email-stefanc@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 0507369bb54d..1767c60056c5 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4699,9 +4699,10 @@ static void mvpp2_irqs_deinit(struct mvpp2_port *port) } } -static bool mvpp22_rss_is_supported(void) +static bool mvpp22_rss_is_supported(struct mvpp2_port *port) { - return queue_mode == MVPP2_QDIST_MULTI_MODE; + return (queue_mode == MVPP2_QDIST_MULTI_MODE) && + !(port->flags & MVPP2_F_LOOPBACK); } static int mvpp2_open(struct net_device *dev) @@ -5513,7 +5514,7 @@ static int mvpp2_ethtool_get_rxnfc(struct net_device *dev, struct mvpp2_port *port = netdev_priv(dev); int ret = 0, i, loc = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; switch (info->cmd) { @@ -5548,7 +5549,7 @@ static int mvpp2_ethtool_set_rxnfc(struct net_device *dev, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; switch (info->cmd) { @@ -5569,7 +5570,9 @@ static int mvpp2_ethtool_set_rxnfc(struct net_device *dev, static u32 mvpp2_ethtool_get_rxfh_indir_size(struct net_device *dev) { - return mvpp22_rss_is_supported() ? MVPP22_RSS_TABLE_ENTRIES : 0; + struct mvpp2_port *port = netdev_priv(dev); + + return mvpp22_rss_is_supported(port) ? MVPP22_RSS_TABLE_ENTRIES : 0; } static int mvpp2_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, @@ -5578,7 +5581,7 @@ static int mvpp2_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (indir) @@ -5596,7 +5599,7 @@ static int mvpp2_ethtool_set_rxfh(struct net_device *dev, const u32 *indir, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_CRC32) @@ -5617,7 +5620,7 @@ static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev, u32 *indir, struct mvpp2_port *port = netdev_priv(dev); int ret = 0; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (rss_context >= MVPP22_N_RSS_TABLES) return -EINVAL; @@ -5639,7 +5642,7 @@ static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev, struct mvpp2_port *port = netdev_priv(dev); int ret; - if (!mvpp22_rss_is_supported()) + if (!mvpp22_rss_is_supported(port)) return -EOPNOTSUPP; if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_CRC32) @@ -5956,7 +5959,7 @@ static int mvpp2_port_init(struct mvpp2_port *port) mvpp2_cls_oversize_rxq_set(port); mvpp2_cls_port_config(port); - if (mvpp22_rss_is_supported()) + if (mvpp22_rss_is_supported(port)) mvpp22_port_rss_init(port); /* Provide an initial Rx packet size */ @@ -6861,7 +6864,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, dev->hw_features |= features | NETIF_F_RXCSUM | NETIF_F_GRO | NETIF_F_HW_VLAN_CTAG_FILTER; - if (mvpp22_rss_is_supported()) { + if (mvpp22_rss_is_supported(port)) { dev->hw_features |= NETIF_F_RXHASH; dev->features |= NETIF_F_NTUPLE; } From 7899ed260c348fe8813150c12c7a59fa0e3e9121 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 8 Feb 2021 14:30:14 -0500 Subject: [PATCH 267/506] riscv: Improve kasan definitions There is no functional change here, only improvement in code readability by adding comments to explain where the kasan constants come from and by replacing hardcoded numerical constant by the corresponding define. Note that the comments come from arm64. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kasan.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index b04028c6218c..a2b3d9cdbc86 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -8,12 +8,28 @@ #ifdef CONFIG_KASAN +/* + * The following comment was copied from arm64: + * KASAN_SHADOW_START: beginning of the kernel virtual addresses. + * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/N of kernel virtual addresses, + * where N = (1 << KASAN_SHADOW_SCALE_SHIFT). + * + * KASAN_SHADOW_OFFSET: + * This value is used to map an address to the corresponding shadow + * address by the following formula: + * shadow_addr = (address >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET + * + * (1 << (64 - KASAN_SHADOW_SCALE_SHIFT)) shadow addresses that lie in range + * [KASAN_SHADOW_OFFSET, KASAN_SHADOW_END) cover all 64-bits of virtual + * addresses. So KASAN_SHADOW_OFFSET should satisfy the following equation: + * KASAN_SHADOW_OFFSET = KASAN_SHADOW_END - + * (1ULL << (64 - KASAN_SHADOW_SCALE_SHIFT)) + */ #define KASAN_SHADOW_SCALE_SHIFT 3 -#define KASAN_SHADOW_SIZE (UL(1) << (38 - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START KERN_VIRT_START /* 2^64 - 2^38 */ +#define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) +#define KASAN_SHADOW_START KERN_VIRT_START #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) - #define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << \ (64 - KASAN_SHADOW_SCALE_SHIFT))) From 9484e2aef45bbc27cd23519917f27031e2857a6f Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 8 Feb 2021 14:30:15 -0500 Subject: [PATCH 268/506] riscv: Use KASAN_SHADOW_INIT define for kasan memory initialization Instead of hardcoding memory initialization to 0, use KASAN_SHADOW_INIT. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 4b9149f963d3..8f23799fcbe6 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -93,7 +93,7 @@ static void __init populate(void *start, void *end) __pgprot(_PAGE_TABLE))); local_flush_tlb_all(); - memset(start, 0, end - start); + memset(start, KASAN_SHADOW_INIT, end - start); } void __init kasan_shallow_populate(void *start, void *end) @@ -163,6 +163,6 @@ void __init kasan_init(void) __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_ACCESSED))); - memset(kasan_early_shadow_page, 0, PAGE_SIZE); + memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); init_task.kasan_depth = 0; } From d127c19c7bea6150a247ffcd529c9a176877e422 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 8 Feb 2021 14:30:16 -0500 Subject: [PATCH 269/506] riscv: Improve kasan population function Current population code populates a whole page table without taking care of what could have been already allocated and without taking into account possible index in page table, assuming the virtual address to map is always aligned on the page table size, which, for example, won't be the case when the kernel will get pushed to the end of the address space. Address those problems by rewriting the kasan population function, splitting it into subfunctions for each different page table level. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 91 ++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 28 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 8f23799fcbe6..a97df4fdce09 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -60,37 +60,72 @@ asmlinkage void __init kasan_early_init(void) local_flush_tlb_all(); } -static void __init populate(void *start, void *end) +static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + pte_t *ptep, *base_pte; + + if (pmd_none(*pmd)) + base_pte = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); + else + base_pte = (pte_t *)pmd_page_vaddr(*pmd); + + ptep = base_pte + pte_index(vaddr); + + do { + if (pte_none(*ptep)) { + phys_addr = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); + set_pte(ptep, pfn_pte(PFN_DOWN(phys_addr), PAGE_KERNEL)); + } + } while (ptep++, vaddr += PAGE_SIZE, vaddr != end); + + set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); +} + +static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + pmd_t *pmdp, *base_pmd; + unsigned long next; + + base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + + pmdp = base_pmd + pmd_index(vaddr); + + do { + next = pmd_addr_end(vaddr, end); + kasan_populate_pte(pmdp, vaddr, next); + } while (pmdp++, vaddr = next, vaddr != end); + + /* + * Wait for the whole PGD to be populated before setting the PGD in + * the page table, otherwise, if we did set the PGD before populating + * it entirely, memblock could allocate a page at a physical address + * where KASAN is not populated yet and then we'd get a page fault. + */ + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); +} + +static void kasan_populate_pgd(unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + pgd_t *pgdp = pgd_offset_k(vaddr); + unsigned long next; + + do { + next = pgd_addr_end(vaddr, end); + kasan_populate_pmd(pgdp, vaddr, next); + } while (pgdp++, vaddr = next, vaddr != end); +} + +static void __init kasan_populate(void *start, void *end) { - unsigned long i, offset; unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); - unsigned long n_pages = (vend - vaddr) / PAGE_SIZE; - unsigned long n_ptes = - ((n_pages + PTRS_PER_PTE) & -PTRS_PER_PTE) / PTRS_PER_PTE; - unsigned long n_pmds = - ((n_ptes + PTRS_PER_PMD) & -PTRS_PER_PMD) / PTRS_PER_PMD; - pte_t *pte = - memblock_alloc(n_ptes * PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); - pmd_t *pmd = - memblock_alloc(n_pmds * PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); - pgd_t *pgd = pgd_offset_k(vaddr); - - for (i = 0; i < n_pages; i++) { - phys_addr_t phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); - set_pte(&pte[i], pfn_pte(PHYS_PFN(phys), PAGE_KERNEL)); - } - - for (i = 0, offset = 0; i < n_ptes; i++, offset += PTRS_PER_PTE) - set_pmd(&pmd[i], - pfn_pmd(PFN_DOWN(__pa(&pte[offset])), - __pgprot(_PAGE_TABLE))); - - for (i = 0, offset = 0; i < n_pmds; i++, offset += PTRS_PER_PMD) - set_pgd(&pgd[i], - pfn_pgd(PFN_DOWN(__pa(&pmd[offset])), - __pgprot(_PAGE_TABLE))); + kasan_populate_pgd(vaddr, vend); local_flush_tlb_all(); memset(start, KASAN_SHADOW_INIT, end - start); @@ -154,7 +189,7 @@ void __init kasan_init(void) if (start >= end) break; - populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); + kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); }; for (i = 0; i < PTRS_PER_PTE; i++) From 4e9d9d1f4880ad358a8e5eb6ac4c811fd76dd617 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 19 Feb 2021 13:10:44 +0300 Subject: [PATCH 270/506] net: phy: icplus: call phy_restore_page() when phy_select_page() fails The comments to phy_select_page() say that "phy_restore_page() must always be called after this, irrespective of success or failure of this call." If we don't call phy_restore_page() then we are still holding the phy_lock_mdio_bus() so it eventually leads to a dead lock. Fixes: 32ab60e53920 ("net: phy: icplus: add MDI/MDIX support for IP101A/G") Fixes: f9bc51e6cce2 ("net: phy: icplus: fix paged register access") Signed-off-by: Dan Carpenter Reviewed-by: Michael Walle Reviewed-by: Russell King Link: https://lore.kernel.org/r/YC+OpFGsDPXPnXM5@mwanda Signed-off-by: Jakub Kicinski --- drivers/net/phy/icplus.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c index 3e431737c1ba..a00a667454a9 100644 --- a/drivers/net/phy/icplus.c +++ b/drivers/net/phy/icplus.c @@ -239,7 +239,7 @@ static int ip101a_g_config_intr_pin(struct phy_device *phydev) oldpage = phy_select_page(phydev, IP101G_DEFAULT_PAGE); if (oldpage < 0) - return oldpage; + goto out; /* configure the RXER/INTR_32 pin of the 32-pin IP101GR if needed: */ switch (priv->sel_intr32) { @@ -314,7 +314,7 @@ static int ip101a_g_read_status(struct phy_device *phydev) oldpage = phy_select_page(phydev, IP101G_DEFAULT_PAGE); if (oldpage < 0) - return oldpage; + goto out; ret = __phy_read(phydev, IP10XX_SPEC_CTRL_STATUS); if (ret < 0) @@ -349,7 +349,8 @@ static int ip101a_g_read_status(struct phy_device *phydev) static int ip101a_g_config_mdix(struct phy_device *phydev) { u16 ctrl = 0, ctrl2 = 0; - int oldpage, ret; + int oldpage; + int ret = 0; switch (phydev->mdix_ctrl) { case ETH_TP_MDI: @@ -367,7 +368,7 @@ static int ip101a_g_config_mdix(struct phy_device *phydev) oldpage = phy_select_page(phydev, IP101G_DEFAULT_PAGE); if (oldpage < 0) - return oldpage; + goto out; ret = __phy_modify(phydev, IP10XX_SPEC_CTRL_STATUS, IP101A_G_AUTO_MDIX_DIS, ctrl); From 94ead4caa0615f4b0719ffcb4dbd0907fe2f9265 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Feb 2021 21:12:21 -0800 Subject: [PATCH 271/506] net: dsa: Fix dependencies with HSR The core DSA framework uses hsr_is_master() which would not resolve to a valid symbol if HSR is built-into the kernel and DSA is a module. Fixes: 18596f504a3e ("net: dsa: add support for offloading HSR") Reported-by: kernel test robot Signed-off-by: Florian Fainelli Reviewed-by: George McCollister Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210220051222.15672-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- net/dsa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index a45572cfb71a..3589224c8da9 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n + depends on HSR || HSR=n select GRO_CELLS select NET_SWITCHDEV select PHYLINK From d7fbcf40df86bb67193d9faf52138fc1202decb2 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 8 Feb 2021 14:30:17 -0500 Subject: [PATCH 272/506] riscv: Improve kasan population by using hugepages when possible The kasan functions that populates the shadow regions used to allocate them page by page and did not take advantage of hugepages, so fix this by trying to allocate hugepages of 1GB and fallback to 2MB hugepages or 4K pages in case it fails. This reduces the page table memory consumption and improves TLB usage, as shown below: Before this patch: ---[ Kasan shadow start ]--- 0xffffffc000000000-0xffffffc400000000 0x00000000818ef000 16G PTE . A . . . . R V 0xffffffc400000000-0xffffffc447fc0000 0x00000002b7f4f000 1179392K PTE D A . . . W R V 0xffffffc480000000-0xffffffc800000000 0x00000000818ef000 14G PTE . A . . . . R V ---[ Kasan shadow end ]--- After this patch: ---[ Kasan shadow start ]--- 0xffffffc000000000-0xffffffc400000000 0x00000000818ef000 16G PTE . A . . . . R V 0xffffffc400000000-0xffffffc440000000 0x0000000240000000 1G PGD D A . . . W R V 0xffffffc440000000-0xffffffc447e00000 0x00000002b7e00000 126M PMD D A . . . W R V 0xffffffc447e00000-0xffffffc447fc0000 0x00000002b818f000 1792K PTE D A . . . W R V 0xffffffc480000000-0xffffffc800000000 0x00000000818ef000 14G PTE . A . . . . R V ---[ Kasan shadow end ]--- Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index a97df4fdce09..c676a4674342 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -96,6 +96,15 @@ static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long en do { next = pmd_addr_end(vaddr, end); + + if (pmd_none(*pmdp) && IS_ALIGNED(vaddr, PMD_SIZE) && (next - vaddr) >= PMD_SIZE) { + phys_addr = memblock_phys_alloc(PMD_SIZE, PMD_SIZE); + if (phys_addr) { + set_pmd(pmdp, pfn_pmd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } + } + kasan_populate_pte(pmdp, vaddr, next); } while (pmdp++, vaddr = next, vaddr != end); @@ -116,6 +125,21 @@ static void kasan_populate_pgd(unsigned long vaddr, unsigned long end) do { next = pgd_addr_end(vaddr, end); + + /* + * pgdp can't be none since kasan_early_init initialized all KASAN + * shadow region with kasan_early_shadow_pmd: if this is stillthe case, + * that means we can try to allocate a hugepage as a replacement. + */ + if (pgd_page_vaddr(*pgdp) == (unsigned long)lm_alias(kasan_early_shadow_pmd) && + IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { + phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); + if (phys_addr) { + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } + } + kasan_populate_pmd(pgdp, vaddr, next); } while (pgdp++, vaddr = next, vaddr != end); } From 341c65242fe18aac8900e4291d472df9f7ba7bc7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 19 Feb 2021 18:35:37 +0100 Subject: [PATCH 273/506] mptcp: fix DATA_FIN processing for orphaned sockets Currently we move orphaned msk sockets directly from FIN_WAIT2 state to CLOSE, with the rationale that incoming additional data could be just dropped by the TCP stack/TW sockets. Anyhow we miss sending MPTCP-level ack on incoming DATA_FIN, and that may hang the peers. Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a57f3eab7b6a..b1075fc3d3b3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2264,13 +2264,12 @@ static void mptcp_worker(struct work_struct *work) __mptcp_check_send_data_fin(sk); mptcp_check_data_fin(sk); - /* if the msk data is completely acked, or the socket timedout, - * there is no point in keeping around an orphaned sk + /* There is no point in keeping around an orphaned sk timedout or + * closed, but we need the msk around to reply to incoming DATA_FIN, + * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD) && - (mptcp_check_close_timeout(sk) || - (state != sk->sk_state && - ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) { + (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { inet_sk_state_store(sk, TCP_CLOSE); __mptcp_destroy_sock(sk); goto unlock; From d87903b63e3ce1eafaa701aec5cc1d0ecd0d84dc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 19 Feb 2021 18:35:38 +0100 Subject: [PATCH 274/506] mptcp: fix DATA_FIN generation on early shutdown If the msk is closed before sending or receiving any data, no DATA_FIN is generated, instead an MPC ack packet is crafted out. In the above scenario, the MPTCP protocol creates and sends a pure ack and such packets matches also the criteria for an MPC ack and the protocol tries first to insert MPC options, leading to the described error. This change addresses the issue by avoiding the insertion of an MPC option for DATA_FIN packets or if the sub-flow is not established. To avoid doing multiple times the same test, fetch the data_fin flag in a bool variable and pass it to both the interested helpers. Fixes: 6d0060f600ad ("mptcp: Write MPTCP DSS headers to outgoing data packets") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b63574d6b812..444a38681e93 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -411,6 +411,7 @@ static void clear_3rdack_retransmission(struct sock *sk) } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, + bool snd_data_fin_enable, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -428,9 +429,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, if (!skb) return false; - /* MPC/MPJ needed only on 3rd ack packet */ - if (subflow->fully_established || - subflow->snd_isn != TCP_SKB_CB(skb)->seq) + /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ + if (subflow->fully_established || snd_data_fin_enable || + subflow->snd_isn != TCP_SKB_CB(skb)->seq || + sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { @@ -502,20 +504,20 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, + bool snd_data_fin_enable, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - u64 snd_data_fin_enable, ack_seq; unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; + u64 ack_seq; mpext = skb ? mptcp_get_ext(skb) : NULL; - snd_data_fin_enable = mptcp_data_fin_enabled(msk); if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; @@ -717,12 +719,15 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; + bool snd_data_fin; bool ret = false; opts->suboptions = 0; - if (unlikely(mptcp_check_fallback(sk))) + if (unlikely(__mptcp_check_fallback(msk))) return false; /* prevent adding of any MPTCP related options on reset packet @@ -731,10 +736,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) return false; - if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) + snd_data_fin = mptcp_data_fin_enabled(msk); + if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; - else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, - opts)) + else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; /* we reserved enough space for the above options, and exceeding the From ad98dd37051e14fa8c785609430d907fcfd518ba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 19 Feb 2021 18:35:39 +0100 Subject: [PATCH 275/506] mptcp: provide subflow aware release function mptcp re-used inet(6)_release, so the subflow sockets are ignored. Need to invoke ip(v6)_mc_drop_socket function to ensure mcast join resources get free'd. Fixes: 717e79c867ca5 ("mptcp: Add setsockopt()/getsockopt() socket operations") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/110 Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 55 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b1075fc3d3b3..c5d5e68940ea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,7 @@ #include #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include +#include #endif #include #include @@ -3374,10 +3376,34 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } +static int mptcp_release(struct socket *sock) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = sock->sk; + struct mptcp_sock *msk; + + if (!sk) + return 0; + + lock_sock(sk); + + msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ip_mc_drop_socket(ssk); + } + + release_sock(sk); + + return inet_release(sock); +} + static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, - .release = inet_release, + .release = mptcp_release, .bind = mptcp_bind, .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, @@ -3469,10 +3495,35 @@ void __init mptcp_proto_init(void) } #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int mptcp6_release(struct socket *sock) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + lock_sock(sk); + + msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ip_mc_drop_socket(ssk); + ipv6_sock_mc_close(ssk); + ipv6_sock_ac_close(ssk); + } + + release_sock(sk); + return inet6_release(sock); +} + static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, - .release = inet6_release, + .release = mptcp6_release, .bind = mptcp_bind, .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, From 52557dbc7538ecceb27ef2206719a47a8039a335 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 19 Feb 2021 18:35:40 +0100 Subject: [PATCH 276/506] mptcp: do not wakeup listener for MPJ subflows MPJ subflows are not exposed as fds to user spaces. As such, incoming MPJ subflows are removed from the accept queue by tcp_check_req()/tcp_get_cookie_sock(). Later tcp_child_process() invokes subflow_data_ready() on the parent socket regardless of the subflow kind, leading to poll wakeups even if the later accept will block. Address the issue by double-checking the queue state before waking the user-space. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/164 Reported-by: Dr. David Alan Gilbert Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 06e233410e0e..e1fbcab257e6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1096,6 +1096,12 @@ static void subflow_data_ready(struct sock *sk) msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { + /* MPJ subflow are removed from accept queue before reaching here, + * avoid stray wakeups + */ + if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) + return; + set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; From 24877687b375f2c476ffb726ea915fc85df09e3d Mon Sep 17 00:00:00 2001 From: "Song, Yoong Siang" Date: Thu, 18 Feb 2021 21:40:53 +0800 Subject: [PATCH 277/506] net: stmmac: fix CBS idleslope and sendslope calculation When link speed is not 100 Mbps, port transmit rate and speed divider are set to 8 and 1000000 respectively. These values are incorrect for CBS idleslope and sendslope HW values calculation if the link speed is not 1 Gbps. This patch adds switch statement to set the values of port transmit rate and speed divider for 10 Gbps, 5 Gbps, 2.5 Gbps, 1 Gbps, and 100 Mbps. Note that CBS is not supported at 10 Mbps. Fixes: bc41a6689b30 ("net: stmmac: tc: Remove the speed dependency") Fixes: 1f705bc61aee ("net: stmmac: Add support for CBS QDISC") Signed-off-by: Song, Yoong Siang Link: https://lore.kernel.org/r/1613655653-11755-1-git-send-email-yoong.siang.song@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_tc.c | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 56985542e202..44bb133c3000 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -316,6 +316,32 @@ static int tc_setup_cbs(struct stmmac_priv *priv, if (!priv->dma_cap.av) return -EOPNOTSUPP; + /* Port Transmit Rate and Speed Divider */ + switch (priv->speed) { + case SPEED_10000: + ptr = 32; + speed_div = 10000000; + break; + case SPEED_5000: + ptr = 32; + speed_div = 5000000; + break; + case SPEED_2500: + ptr = 8; + speed_div = 2500000; + break; + case SPEED_1000: + ptr = 8; + speed_div = 1000000; + break; + case SPEED_100: + ptr = 4; + speed_div = 100000; + break; + default: + return -EOPNOTSUPP; + } + mode_to_use = priv->plat->tx_queues_cfg[queue].mode_to_use; if (mode_to_use == MTL_QUEUE_DCB && qopt->enable) { ret = stmmac_dma_qmode(priv, priv->ioaddr, queue, MTL_QUEUE_AVB); @@ -332,10 +358,6 @@ static int tc_setup_cbs(struct stmmac_priv *priv, priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; } - /* Port Transmit Rate and Speed Divider */ - ptr = (priv->speed == SPEED_100) ? 4 : 8; - speed_div = (priv->speed == SPEED_100) ? 100000 : 1000000; - /* Final adjustments for HW */ value = div_s64(qopt->idleslope * 1024ll * ptr, speed_div); priv->plat->tx_queues_cfg[queue].idle_slope = value & GENMASK(31, 0); From 8eb65fda4a6dbd59cd5de24b106a10b6ee0d2176 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Sun, 21 Feb 2021 22:35:59 +0800 Subject: [PATCH 278/506] net/mlx4_core: Add missed mlx4_free_cmd_mailbox() mlx4_do_mirror_rule() forgets to call mlx4_free_cmd_mailbox() to free the memory region allocated by mlx4_alloc_cmd_mailbox() before an exit. Add the missed call to fix it. Fixes: 78efed275117 ("net/mlx4_core: Support mirroring VF DMFS rules on both ports") Signed-off-by: Chuhong Yuan Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20210221143559.390277-1-hslester96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 394f43add85c..a99e71bc7b3c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -4986,6 +4986,7 @@ static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule if (!fs_rule->mirr_mbox) { mlx4_err(dev, "rule mirroring mailbox is null\n"); + mlx4_free_cmd_mailbox(dev, mailbox); return -EINVAL; } memcpy(mailbox->buf, fs_rule->mirr_mbox, fs_rule->mirr_mbox_size); From d47422d953e258ad587b5edf2274eb95d08bdc7d Mon Sep 17 00:00:00 2001 From: He Zhe Date: Tue, 23 Feb 2021 16:25:34 +0800 Subject: [PATCH 279/506] arm64: uprobe: Return EOPNOTSUPP for AARCH32 instruction probing As stated in linux/errno.h, ENOTSUPP should never be seen by user programs. When we set up uprobe with 32-bit perf and arm64 kernel, we would see the following vague error without useful hint. The sys_perf_event_open() syscall returned with 524 (INTERNAL ERROR: strerror_r(524, [buf], 128)=22) Use EOPNOTSUPP instead to indicate such cases. Signed-off-by: He Zhe Link: https://lore.kernel.org/r/20210223082535.48730-1-zhe.he@windriver.com Cc: Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index a412d8edbcd2..2c247634552b 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -38,7 +38,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) - return -ENOTSUPP; + return -EOPNOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; From 2e8acca1911b14e0cc7464db796b804785a3831a Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Mon, 22 Feb 2021 09:43:51 +0800 Subject: [PATCH 280/506] arm64/mm: Fixed some coding style issues Adjust whitespace for fixmap_pXd() functions returning pointers for consistency with the kernel coding style. Signed-off-by: Zhiyuan Dai Link: https://lore.kernel.org/r/1613958231-5474-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 25af183e4bed..9ce8641941fb 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1155,7 +1155,7 @@ void vmemmap_free(unsigned long start, unsigned long end, } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static inline pud_t * fixmap_pud(unsigned long addr) +static inline pud_t *fixmap_pud(unsigned long addr) { pgd_t *pgdp = pgd_offset_k(addr); p4d_t *p4dp = p4d_offset(pgdp, addr); @@ -1166,7 +1166,7 @@ static inline pud_t * fixmap_pud(unsigned long addr) return pud_offset_kimg(p4dp, addr); } -static inline pmd_t * fixmap_pmd(unsigned long addr) +static inline pmd_t *fixmap_pmd(unsigned long addr) { pud_t *pudp = fixmap_pud(addr); pud_t pud = READ_ONCE(*pudp); @@ -1176,7 +1176,7 @@ static inline pmd_t * fixmap_pmd(unsigned long addr) return pmd_offset_kimg(pudp, addr); } -static inline pte_t * fixmap_pte(unsigned long addr) +static inline pte_t *fixmap_pte(unsigned long addr) { return &bm_pte[pte_index(addr)]; } From 49387f628840eac1e7e1113f4f2c150cdecf88c7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Feb 2021 11:36:21 +0000 Subject: [PATCH 281/506] vmlinux.lds.h: catch even more instrumentation symbols into .data LKP caught another bunch of orphaned instrumentation symbols [0]: mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from `init/main.o' being placed in section `.data.$LPBX1' mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from `init/main.o' being placed in section `.data.$LPBX0' mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from `init/do_mounts.o' being placed in section `.data.$LPBX1' mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from `init/do_mounts.o' being placed in section `.data.$LPBX0' mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from `init/do_mounts_initrd.o' being placed in section `.data.$LPBX1' mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from `init/do_mounts_initrd.o' being placed in section `.data.$LPBX0' mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from `init/initramfs.o' being placed in section `.data.$LPBX1' mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from `init/initramfs.o' being placed in section `.data.$LPBX0' mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from `init/calibrate.o' being placed in section `.data.$LPBX1' mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from `init/calibrate.o' being placed in section `.data.$LPBX0' [...] Soften the wildcard to .data.$L* to grab these ones into .data too. [0] https://lore.kernel.org/lkml/202102231519.lWPLPveV-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Alexander Lobakin Signed-off-by: Thomas Bogendoerfer --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 83537e5ee78f..ecfb54e9fda5 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -95,7 +95,7 @@ */ #ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION #define TEXT_MAIN .text .text.[0-9a-zA-Z_]* -#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$Lubsan_* +#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L* #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]* #define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L* #define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..compoundliteral* From faf3c25e51a7e91b69ea26da72c74a8786af7968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Mon, 22 Feb 2021 21:33:50 +0100 Subject: [PATCH 282/506] mips: bmips: init clocks earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit device_initcall() is too late for bcm63xx. We need to call of_clk_init() earlier in order to properly boot. Signed-off-by: Álvaro Fernández Rojas Signed-off-by: Thomas Bogendoerfer --- arch/mips/bmips/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index 95f8f10d8697..31bcfa4e08b9 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -196,4 +196,4 @@ static int __init plat_dev_init(void) return 0; } -device_initcall(plat_dev_init); +arch_initcall(plat_dev_init); From 057a14d610cfd671df9c9044224f34e553cd7041 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Feb 2021 17:19:03 +0100 Subject: [PATCH 283/506] arch: mips: update references to current linux-mips list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linux-mips mailing list now lives at kernel.org. Update all references in the kernel tree. Signed-off-by: Lukas Bulwahn Reviewed-by: Huacai Chen Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/r4k-bugs64.c | 2 +- arch/mips/lib/iomap-pci.c | 2 +- arch/mips/sgi-ip32/ip32-irq.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/kernel/r4k-bugs64.c b/arch/mips/kernel/r4k-bugs64.c index 1ff19f1ea5ca..35729c9e6cfa 100644 --- a/arch/mips/kernel/r4k-bugs64.c +++ b/arch/mips/kernel/r4k-bugs64.c @@ -18,7 +18,7 @@ static char bug64hit[] __initdata = "reliable operation impossible!\n%s"; static char nowar[] __initdata = - "Please report to ."; + "Please report to ."; static char r4kwar[] __initdata = "Enable CPU_R4000_WORKAROUNDS to rectify."; static char daddiwar[] __initdata = diff --git a/arch/mips/lib/iomap-pci.c b/arch/mips/lib/iomap-pci.c index 210f5a95ecb1..a9cb28813f0b 100644 --- a/arch/mips/lib/iomap-pci.c +++ b/arch/mips/lib/iomap-pci.c @@ -32,7 +32,7 @@ void __iomem *__pci_ioport_map(struct pci_dev *dev, sprintf(name, "%04x:%02x", pci_domain_nr(bus), bus->number); printk(KERN_WARNING "io_map_base of root PCI bus %s unset. " "Trying to continue but you better\nfix this issue or " - "report it to linux-mips@linux-mips.org or your " + "report it to linux-mips@vger.kernel.org or your " "vendor.\n", name); #ifdef CONFIG_PCI_DOMAINS panic("To avoid data corruption io_map_base MUST be set with " diff --git a/arch/mips/sgi-ip32/ip32-irq.c b/arch/mips/sgi-ip32/ip32-irq.c index 1bbd5bfb5458..e21ea1de05e3 100644 --- a/arch/mips/sgi-ip32/ip32-irq.c +++ b/arch/mips/sgi-ip32/ip32-irq.c @@ -343,7 +343,7 @@ static void ip32_unknown_interrupt(void) printk("Register dump:\n"); show_regs(get_irq_regs()); - printk("Please mail this report to linux-mips@linux-mips.org\n"); + printk("Please mail this report to linux-mips@vger.kernel.org\n"); printk("Spinning..."); while(1) ; } From 02cc6b495dd694484167a841d7ede4b6209c658f Mon Sep 17 00:00:00 2001 From: Jiapeng Zhong Date: Wed, 20 Jan 2021 15:50:31 +0800 Subject: [PATCH 284/506] virtio-mem: Assign boolean values to a bool variable Fix the following coccicheck warnings: ./drivers/virtio/virtio_mem.c:2580:2-25: WARNING: Assignment of 0/1 to bool variable. Reported-by: Abaci Robot Signed-off-by: Jiapeng Zhong Link: https://lore.kernel.org/r/1611129031-82818-1-git-send-email-abaci-bugfix@linux.alibaba.com Signed-off-by: Tian Tao Acked-by: David Hildenbrand Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 9fc9ec4a25f5..85a272c9978e 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2577,7 +2577,7 @@ static int virtio_mem_probe(struct virtio_device *vdev) * actually in use (e.g., trying to reload the driver). */ if (vm->plugged_size) { - vm->unplug_all_required = 1; + vm->unplug_all_required = true; dev_info(&vm->vdev->dev, "unplugging all memory is required\n"); } From 437944126004d531ccac113db57985a713fc366d Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:31:58 +0200 Subject: [PATCH 285/506] vdpa_sim_net: Make mac address array static MAC address array is used only in vdpa_sim_net.c. Hence, keep it static. Signed-off-by: Parav Pandit Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index c10b6981fdab..f0482427186b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -33,7 +33,7 @@ static char *macaddr; module_param(macaddr, charp, 0); MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); -u8 macaddr_buf[ETH_ALEN]; +static u8 macaddr_buf[ETH_ALEN]; static struct vdpasim *vdpasim_net_dev; From fd70a406a344e084ac680c3f14e71d37d6023883 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:31:59 +0200 Subject: [PATCH 286/506] vdpa: Extend routine to accept vdpa device name In a subsequent patch, when user initiated command creates a vdpa device, the user chooses the name of the vdpa device. To support it, extend the device allocation API to consider this name specified by the caller driver. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-3-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_main.c | 2 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- drivers/vdpa/vdpa.c | 36 +++++++++++++++++++++++++++---- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- include/linux/vdpa.h | 7 +++--- 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index fa1af301cf55..7c8bbfcf6c3e 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -432,7 +432,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, dev, &ifc_vdpa_ops, - IFCVF_MAX_QUEUE_PAIRS * 2); + IFCVF_MAX_QUEUE_PAIRS * 2, NULL); if (adapter == NULL) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return -ENOMEM; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index b5fe6d2ad22f..dc88559a8d49 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1982,7 +1982,7 @@ static int mlx5v_probe(struct auxiliary_device *adev, max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - 2 * mlx5_vdpa_max_qps(max_vqs)); + 2 * mlx5_vdpa_max_qps(max_vqs), NULL); if (IS_ERR(ndev)) return PTR_ERR(ndev); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index c0825650c055..7414bbd9057c 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -12,6 +12,8 @@ #include #include +/* A global mutex that protects vdpa management device and device level operations. */ +static DEFINE_MUTEX(vdpa_dev_mutex); static DEFINE_IDA(vdpa_index_ida); static int vdpa_dev_probe(struct device *d) @@ -63,6 +65,7 @@ static void vdpa_release_dev(struct device *d) * @config: the bus operations that is supported by this device * @nvqs: number of virtqueues supported by this device * @size: size of the parent structure that contains private data + * @name: name of the vdpa device; optional. * * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. @@ -72,8 +75,7 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - int nvqs, - size_t size) + int nvqs, size_t size, const char *name) { struct vdpa_device *vdev; int err = -EINVAL; @@ -101,7 +103,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->features_valid = false; vdev->nvqs = nvqs; - err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); + if (name) + err = dev_set_name(&vdev->dev, "%s", name); + else + err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); if (err) goto err_name; @@ -118,6 +123,13 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, } EXPORT_SYMBOL_GPL(__vdpa_alloc_device); +static int vdpa_name_match(struct device *dev, const void *data) +{ + struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); + + return (strcmp(dev_name(&vdev->dev), data) == 0); +} + /** * vdpa_register_device - register a vDPA device * Callers must have a succeed call of vdpa_alloc_device() before. @@ -127,7 +139,21 @@ EXPORT_SYMBOL_GPL(__vdpa_alloc_device); */ int vdpa_register_device(struct vdpa_device *vdev) { - return device_add(&vdev->dev); + struct device *dev; + int err; + + mutex_lock(&vdpa_dev_mutex); + dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); + if (dev) { + put_device(dev); + err = -EEXIST; + goto name_err; + } + + err = device_add(&vdev->dev); +name_err: + mutex_unlock(&vdpa_dev_mutex); + return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); @@ -137,7 +163,9 @@ EXPORT_SYMBOL_GPL(vdpa_register_device); */ void vdpa_unregister_device(struct vdpa_device *vdev) { + mutex_lock(&vdpa_dev_mutex); device_unregister(&vdev->dev); + mutex_unlock(&vdpa_dev_mutex); } EXPORT_SYMBOL_GPL(vdpa_unregister_device); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index b3fcc67bfdf0..db1636a99ba4 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -235,7 +235,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) ops = &vdpasim_config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, - dev_attr->nvqs); + dev_attr->nvqs, NULL); if (!vdpasim) goto err_alloc; diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 0fefeb976877..5700baa22356 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -245,15 +245,14 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - int nvqs, - size_t size); + int nvqs, size_t size, const char *name); -#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs, name) \ container_of(__vdpa_alloc_device( \ parent, config, nvqs, \ sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member))), \ + dev_struct, member)), name), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev); From 33b347503f014ebf76257327cbc7001c6b721956 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:32:00 +0200 Subject: [PATCH 287/506] vdpa: Define vdpa mgmt device, ops and a netlink interface To add one or more VDPA devices, define a management device which allows adding or removing vdpa device. A management device defines set of callbacks to manage vdpa devices. To begin with, it defines add and remove callbacks through which a user defined vdpa device can be added or removed. A unique management device is identified by its unique handle identified by management device name and optionally the bus name. Hence, introduce routine through which driver can register a management device and its callback operations for adding and remove a vdpa device. Introduce vdpa netlink socket family so that user can query management device and its attributes. Example of show vdpa management device which allows creating vdpa device of networking class (device id = 0x1) of virtio specification 1.1 section 5.1.1. $ vdpa mgmtdev show vdpasim_net: supported_classes: net Example of showing vdpa management device in JSON format. $ vdpa mgmtdev show -jp { "show": { "vdpasim_net": { "supported_classes": [ "net" ] } } } Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-4-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Including a bugfix: vpda: correctly size vdpa_nl_policy We need to ensure last entry of vdpa_nl_policy[] is zero, otherwise out-of-bounds access is hurting us. Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Parav Pandit Cc: Eli Cohen Cc: Jason Wang Cc: Michael S. Tsirkin Link: https://lore.kernel.org/r/20210210134911.4119555-1-eric.dumazet@gmail.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/Kconfig | 1 + drivers/vdpa/vdpa.c | 213 +++++++++++++++++++++++++++++++++++++- include/linux/vdpa.h | 31 ++++++ include/uapi/linux/vdpa.h | 31 ++++++ 4 files changed, 275 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/vdpa.h diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 92a6396f8a73..ffd1e098bfd2 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig VDPA tristate "vDPA drivers" + depends on NET help Enable this module to support vDPA device that uses a datapath which complies with virtio specifications with diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 7414bbd9057c..586e66b7f671 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -11,11 +11,17 @@ #include #include #include +#include +#include +#include +static LIST_HEAD(mdev_head); /* A global mutex that protects vdpa management device and device level operations. */ static DEFINE_MUTEX(vdpa_dev_mutex); static DEFINE_IDA(vdpa_index_ida); +static struct genl_family vdpa_nl_family; + static int vdpa_dev_probe(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); @@ -195,13 +201,218 @@ void vdpa_unregister_driver(struct vdpa_driver *drv) } EXPORT_SYMBOL_GPL(vdpa_unregister_driver); +/** + * vdpa_mgmtdev_register - register a vdpa management device + * + * @mdev: Pointer to vdpa management device + * vdpa_mgmtdev_register() register a vdpa management device which supports + * vdpa device management. + */ +int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) +{ + if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del) + return -EINVAL; + + INIT_LIST_HEAD(&mdev->list); + mutex_lock(&vdpa_dev_mutex); + list_add_tail(&mdev->list, &mdev_head); + mutex_unlock(&vdpa_dev_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); + +void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) +{ + mutex_lock(&vdpa_dev_mutex); + list_del(&mdev->list); + mutex_unlock(&vdpa_dev_mutex); +} +EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); + +static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev, + const char *busname, const char *devname) +{ + /* Bus name is optional for simulated management device, so ignore the + * device with bus if bus attribute is provided. + */ + if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus)) + return false; + + if (!busname && strcmp(dev_name(mdev->device), devname) == 0) + return true; + + if (busname && (strcmp(mdev->device->bus->name, busname) == 0) && + (strcmp(dev_name(mdev->device), devname) == 0)) + return true; + + return false; +} + +static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs) +{ + struct vdpa_mgmt_dev *mdev; + const char *busname = NULL; + const char *devname; + + if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]) + return ERR_PTR(-EINVAL); + devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]); + if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]) + busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]); + + list_for_each_entry(mdev, &mdev_head, list) { + if (mgmtdev_handle_match(mdev, busname, devname)) + return mdev; + } + return ERR_PTR(-ENODEV); +} + +static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev) +{ + if (mdev->device->bus && + nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device))) + return -EMSGSIZE; + return 0; +} + +static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, + u32 portid, u32 seq, int flags) +{ + u64 supported_classes = 0; + void *hdr; + int i = 0; + int err; + + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); + if (!hdr) + return -EMSGSIZE; + err = vdpa_nl_mgmtdev_handle_fill(msg, mdev); + if (err) + goto msg_err; + + while (mdev->id_table[i].device) { + supported_classes |= BIT(mdev->id_table[i].device); + i++; + } + + if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, + supported_classes, VDPA_ATTR_UNSPEC)) { + err = -EMSGSIZE; + goto msg_err; + } + + genlmsg_end(msg, hdr); + return 0; + +msg_err: + genlmsg_cancel(msg, hdr); + return err; +} + +static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_mgmt_dev *mdev; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&vdpa_dev_mutex); + mdev = vdpa_mgmtdev_get_from_attr(info->attrs); + if (IS_ERR(mdev)) { + mutex_unlock(&vdpa_dev_mutex); + NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); + err = PTR_ERR(mdev); + goto out; + } + + err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); + mutex_unlock(&vdpa_dev_mutex); + if (err) + goto out; + err = genlmsg_reply(msg, info); + return err; + +out: + nlmsg_free(msg); + return err; +} + +static int +vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct vdpa_mgmt_dev *mdev; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&vdpa_dev_mutex); + list_for_each_entry(mdev, &mdev_head, list) { + if (idx < start) { + idx++; + continue; + } + err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + goto out; + idx++; + } +out: + mutex_unlock(&vdpa_dev_mutex); + cb->args[0] = idx; + return msg->len; +} + +static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { + [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, + [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, +}; + +static const struct genl_ops vdpa_nl_ops[] = { + { + .cmd = VDPA_CMD_MGMTDEV_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = vdpa_nl_cmd_mgmtdev_get_doit, + .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, + }, +}; + +static struct genl_family vdpa_nl_family __ro_after_init = { + .name = VDPA_GENL_NAME, + .version = VDPA_GENL_VERSION, + .maxattr = VDPA_ATTR_MAX, + .policy = vdpa_nl_policy, + .netnsok = false, + .module = THIS_MODULE, + .ops = vdpa_nl_ops, + .n_ops = ARRAY_SIZE(vdpa_nl_ops), +}; + static int vdpa_init(void) { - return bus_register(&vdpa_bus); + int err; + + err = bus_register(&vdpa_bus); + if (err) + return err; + err = genl_register_family(&vdpa_nl_family); + if (err) + goto err; + return 0; + +err: + bus_unregister(&vdpa_bus); + return err; } static void __exit vdpa_exit(void) { + genl_unregister_family(&vdpa_nl_family); bus_unregister(&vdpa_bus); ida_destroy(&vdpa_index_ida); } diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 5700baa22356..6b8b4222bca6 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -35,6 +35,8 @@ struct vdpa_vq_state { u16 avail_index; }; +struct vdpa_mgmt_dev; + /** * vDPA device - representation of a vDPA device * @dev: underlying device @@ -335,4 +337,33 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, ops->get_config(vdev, offset, buf, len); } +/** + * vdpa_mgmtdev_ops - vdpa device ops + * @dev_add: Add a vdpa device using alloc and register + * @mdev: parent device to use for device addition + * @name: name of the new vdpa device + * Driver need to add a new device using _vdpa_register_device() + * after fully initializing the vdpa device. Driver must return 0 + * on success or appropriate error code. + * @dev_del: Remove a vdpa device using unregister + * @mdev: parent device to use for device removal + * @dev: vdpa device to remove + * Driver need to remove the specified device by calling + * _vdpa_unregister_device(). + */ +struct vdpa_mgmtdev_ops { + int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name); + void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev); +}; + +struct vdpa_mgmt_dev { + struct device *device; + const struct vdpa_mgmtdev_ops *ops; + const struct virtio_device_id *id_table; /* supported ids */ + struct list_head list; +}; + +int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev); +void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev); + #endif /* _LINUX_VDPA_H */ diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h new file mode 100644 index 000000000000..d44d82e567b1 --- /dev/null +++ b/include/uapi/linux/vdpa.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * vdpa device management interface + * Copyright (c) 2020 Mellanox Technologies Ltd. All rights reserved. + */ + +#ifndef _UAPI_LINUX_VDPA_H_ +#define _UAPI_LINUX_VDPA_H_ + +#define VDPA_GENL_NAME "vdpa" +#define VDPA_GENL_VERSION 0x1 + +enum vdpa_command { + VDPA_CMD_UNSPEC, + VDPA_CMD_MGMTDEV_NEW, + VDPA_CMD_MGMTDEV_GET, /* can dump */ +}; + +enum vdpa_attr { + VDPA_ATTR_UNSPEC, + + /* bus name (optional) + dev name together make the parent device handle */ + VDPA_ATTR_MGMTDEV_BUS_NAME, /* string */ + VDPA_ATTR_MGMTDEV_DEV_NAME, /* string */ + VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, /* u64 */ + + /* new attributes must be added above here */ + VDPA_ATTR_MAX, +}; + +#endif From 903f7bcaedb84ca47998e609015a34ddde93742e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:32:01 +0200 Subject: [PATCH 288/506] vdpa: Enable a user to add and delete a vdpa device Add the ability to add and delete a vdpa device. Examples: Create a vdpa device of type network named "foo2" from the management device vdpasim: $ vdpa dev add mgmtdev vdpasim_net name foo2 Delete the vdpa device after its use: $ vdpa dev del foo2 Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-5-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 143 +++++++++++++++++++++++++++++++++++--- include/linux/vdpa.h | 6 ++ include/uapi/linux/vdpa.h | 4 ++ 3 files changed, 143 insertions(+), 10 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 586e66b7f671..f1228189814f 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -136,6 +136,37 @@ static int vdpa_name_match(struct device *dev, const void *data) return (strcmp(dev_name(&vdev->dev), data) == 0); } +static int __vdpa_register_device(struct vdpa_device *vdev) +{ + struct device *dev; + + lockdep_assert_held(&vdpa_dev_mutex); + dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); + if (dev) { + put_device(dev); + return -EEXIST; + } + return device_add(&vdev->dev); +} + +/** + * _vdpa_register_device - register a vDPA device with vdpa lock held + * Caller must have a succeed call of vdpa_alloc_device() before. + * Caller must invoke this routine in the management device dev_add() + * callback after setting up valid mgmtdev for this vdpa device. + * @vdev: the vdpa device to be registered to vDPA bus + * + * Returns an error when fail to add device to vDPA bus + */ +int _vdpa_register_device(struct vdpa_device *vdev) +{ + if (!vdev->mdev) + return -EINVAL; + + return __vdpa_register_device(vdev); +} +EXPORT_SYMBOL_GPL(_vdpa_register_device); + /** * vdpa_register_device - register a vDPA device * Callers must have a succeed call of vdpa_alloc_device() before. @@ -145,24 +176,29 @@ static int vdpa_name_match(struct device *dev, const void *data) */ int vdpa_register_device(struct vdpa_device *vdev) { - struct device *dev; int err; mutex_lock(&vdpa_dev_mutex); - dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); - if (dev) { - put_device(dev); - err = -EEXIST; - goto name_err; - } - - err = device_add(&vdev->dev); -name_err: + err = __vdpa_register_device(vdev); mutex_unlock(&vdpa_dev_mutex); return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); +/** + * _vdpa_unregister_device - unregister a vDPA device + * Caller must invoke this routine as part of management device dev_del() + * callback. + * @vdev: the vdpa device to be unregisted from vDPA bus + */ +void _vdpa_unregister_device(struct vdpa_device *vdev) +{ + lockdep_assert_held(&vdpa_dev_mutex); + WARN_ON(!vdev->mdev); + device_unregister(&vdev->dev); +} +EXPORT_SYMBOL_GPL(_vdpa_unregister_device); + /** * vdpa_unregister_device - unregister a vDPA device * @vdev: the vdpa device to be unregisted from vDPA bus @@ -221,10 +257,25 @@ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); +static int vdpa_match_remove(struct device *dev, void *data) +{ + struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); + struct vdpa_mgmt_dev *mdev = vdev->mdev; + + if (mdev == data) + mdev->ops->dev_del(mdev, vdev); + return 0; +} + void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) { mutex_lock(&vdpa_dev_mutex); + list_del(&mdev->list); + + /* Filter out all the entries belong to this management device and delete it. */ + bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); + mutex_unlock(&vdpa_dev_mutex); } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); @@ -368,9 +419,69 @@ vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) return msg->len; } +static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_mgmt_dev *mdev; + const char *name; + int err = 0; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + + mutex_lock(&vdpa_dev_mutex); + mdev = vdpa_mgmtdev_get_from_attr(info->attrs); + if (IS_ERR(mdev)) { + NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); + err = PTR_ERR(mdev); + goto err; + } + + err = mdev->ops->dev_add(mdev, name); +err: + mutex_unlock(&vdpa_dev_mutex); + return err; +} + +static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_mgmt_dev *mdev; + struct vdpa_device *vdev; + struct device *dev; + const char *name; + int err = 0; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + + mutex_lock(&vdpa_dev_mutex); + dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user"); + err = -EINVAL; + goto mdev_err; + } + mdev = vdev->mdev; + mdev->ops->dev_del(mdev, vdev); +mdev_err: + put_device(dev); +dev_err: + mutex_unlock(&vdpa_dev_mutex); + return err; +} + static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, + [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, }; static const struct genl_ops vdpa_nl_ops[] = { @@ -380,6 +491,18 @@ static const struct genl_ops vdpa_nl_ops[] = { .doit = vdpa_nl_cmd_mgmtdev_get_doit, .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, }, + { + .cmd = VDPA_CMD_DEV_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = vdpa_nl_cmd_dev_add_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = VDPA_CMD_DEV_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = vdpa_nl_cmd_dev_del_set_doit, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family vdpa_nl_family __ro_after_init = { diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 6b8b4222bca6..4ab5494503a8 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -45,6 +45,8 @@ struct vdpa_mgmt_dev; * @index: device index * @features_valid: were features initialized? for legacy guests * @nvqs: maximum number of supported virtqueues + * @mdev: management device pointer; caller must setup when registering device as part + * of dev_add() mgmtdev ops callback before invoking _vdpa_register_device(). */ struct vdpa_device { struct device dev; @@ -53,6 +55,7 @@ struct vdpa_device { unsigned int index; bool features_valid; int nvqs; + struct vdpa_mgmt_dev *mdev; }; /** @@ -260,6 +263,9 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, int vdpa_register_device(struct vdpa_device *vdev); void vdpa_unregister_device(struct vdpa_device *vdev); +int _vdpa_register_device(struct vdpa_device *vdev); +void _vdpa_unregister_device(struct vdpa_device *vdev); + /** * vdpa_driver - operations for a vDPA driver * @driver: underlying device driver diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index d44d82e567b1..bb4a1f00eb1c 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -14,6 +14,8 @@ enum vdpa_command { VDPA_CMD_UNSPEC, VDPA_CMD_MGMTDEV_NEW, VDPA_CMD_MGMTDEV_GET, /* can dump */ + VDPA_CMD_DEV_NEW, + VDPA_CMD_DEV_DEL, }; enum vdpa_attr { @@ -24,6 +26,8 @@ enum vdpa_attr { VDPA_ATTR_MGMTDEV_DEV_NAME, /* string */ VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, /* u64 */ + VDPA_ATTR_DEV_NAME, /* string */ + /* new attributes must be added above here */ VDPA_ATTR_MAX, }; From bc0d90ee021f1baecd6aaa010d787eb373aa74dd Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:32:02 +0200 Subject: [PATCH 289/506] vdpa: Enable user to query vdpa device info Enable user to query vdpa device information. $ vdpa dev add mgmtdev vdpasim_net name foo2 Show the newly created vdpa device by its name: $ vdpa dev show foo2 foo2: type network mgmtdev vdpasim_net vendor_id 0 max_vqs 2 max_vq_size 256 $ vdpa dev show foo2 -jp { "dev": { "foo2": { "type": "network", "mgmtdev": "vdpasim_net", "vendor_id": 0, "max_vqs": 2, "max_vq_size": 256 } } } Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-6-parav@nvidia.com Including a memory leak fix: Link: https://lore.kernel.org/r/20210217060614.59561-1-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 131 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vdpa.h | 5 ++ 2 files changed, 136 insertions(+) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index f1228189814f..da67f07e24fd 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -478,6 +478,131 @@ static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *i return err; } +static int +vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) +{ + u16 max_vq_size; + u32 device_id; + u32 vendor_id; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW); + if (!hdr) + return -EMSGSIZE; + + err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev); + if (err) + goto msg_err; + + device_id = vdev->config->get_device_id(vdev); + vendor_id = vdev->config->get_vendor_id(vdev); + max_vq_size = vdev->config->get_vq_num_max(vdev); + + err = -EMSGSIZE; + if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) + goto msg_err; + if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) + goto msg_err; + if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id)) + goto msg_err; + if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs)) + goto msg_err; + if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size)) + goto msg_err; + + genlmsg_end(msg, hdr); + return 0; + +msg_err: + genlmsg_cancel(msg, hdr); + return err; +} + +static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct vdpa_device *vdev; + struct sk_buff *msg; + const char *devname; + struct device *dev; + int err; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&vdpa_dev_mutex); + dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + err = -EINVAL; + goto mdev_err; + } + err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); + if (!err) + err = genlmsg_reply(msg, info); +mdev_err: + put_device(dev); +err: + mutex_unlock(&vdpa_dev_mutex); + if (err) + nlmsg_free(msg); + return err; +} + +struct vdpa_dev_dump_info { + struct sk_buff *msg; + struct netlink_callback *cb; + int start_idx; + int idx; +}; + +static int vdpa_dev_dump(struct device *dev, void *data) +{ + struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); + struct vdpa_dev_dump_info *info = data; + int err; + + if (!vdev->mdev) + return 0; + if (info->idx < info->start_idx) { + info->idx++; + return 0; + } + err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, + info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); + if (err) + return err; + + info->idx++; + return 0; +} + +static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct vdpa_dev_dump_info info; + + info.msg = msg; + info.cb = cb; + info.start_idx = cb->args[0]; + info.idx = 0; + + mutex_lock(&vdpa_dev_mutex); + bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); + mutex_unlock(&vdpa_dev_mutex); + cb->args[0] = info.idx; + return msg->len; +} + static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, @@ -503,6 +628,12 @@ static const struct genl_ops vdpa_nl_ops[] = { .doit = vdpa_nl_cmd_dev_del_set_doit, .flags = GENL_ADMIN_PERM, }, + { + .cmd = VDPA_CMD_DEV_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = vdpa_nl_cmd_dev_get_doit, + .dumpit = vdpa_nl_cmd_dev_get_dumpit, + }, }; static struct genl_family vdpa_nl_family __ro_after_init = { diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index bb4a1f00eb1c..66a41e4ec163 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -16,6 +16,7 @@ enum vdpa_command { VDPA_CMD_MGMTDEV_GET, /* can dump */ VDPA_CMD_DEV_NEW, VDPA_CMD_DEV_DEL, + VDPA_CMD_DEV_GET, /* can dump */ }; enum vdpa_attr { @@ -27,6 +28,10 @@ enum vdpa_attr { VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, /* u64 */ VDPA_ATTR_DEV_NAME, /* string */ + VDPA_ATTR_DEV_ID, /* u32 */ + VDPA_ATTR_DEV_VENDOR_ID, /* u32 */ + VDPA_ATTR_DEV_MAX_VQS, /* u32 */ + VDPA_ATTR_DEV_MAX_VQ_SIZE, /* u16 */ /* new attributes must be added above here */ VDPA_ATTR_MAX, From a3c06ae158dd6fa8336157c31d9234689d068d02 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:32:03 +0200 Subject: [PATCH 290/506] vdpa_sim_net: Add support for user supported devices Enable user to create vdpasim net simulate devices. Show vdpa management device that supports creating, deleting vdpa devices. $ vdpa mgmtdev show vdpasim_net: supported_classes net $ vdpa mgmtdev show -jp { "show": { "vdpasim_net": { "supported_classes": { "net" } } } Create a vdpa device of type networking named as "foo2" from the management device vdpasim: $ vdpa dev add mgmtdev vdpasim_net name foo2 Show the newly created vdpa device by its name: $ vdpa dev show foo2 foo2: type network mgmtdev vdpasim_net vendor_id 0 max_vqs 2 max_vq_size 256 $ vdpa dev show foo2 -jp { "dev": { "foo2": { "type": "network", "mgmtdev": "vdpasim_net", "vendor_id": 0, "max_vqs": 2, "max_vq_size": 256 } } } Delete the vdpa device after its use: $ vdpa dev del foo2 Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-7-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 +- drivers/vdpa/vdpa_sim/vdpa_sim.h | 2 + drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 98 ++++++++++++++++++++-------- 3 files changed, 76 insertions(+), 27 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index db1636a99ba4..d5942842432d 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -235,7 +235,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) ops = &vdpasim_config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, - dev_attr->nvqs, NULL); + dev_attr->nvqs, dev_attr->name); if (!vdpasim) goto err_alloc; @@ -249,6 +249,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) goto err_iommu; set_dma_ops(dev, &vdpasim_dma_ops); + vdpasim->vdpa.mdev = dev_attr->mgmt_dev; vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL); if (!vdpasim->config) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index b02142293d5b..6d75444f9948 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -33,6 +33,8 @@ struct vdpasim_virtqueue { }; struct vdpasim_dev_attr { + struct vdpa_mgmt_dev *mgmt_dev; + const char *name; u64 supported_features; size_t config_size; size_t buffer_size; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index f0482427186b..d344c5b7c914 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -35,8 +35,6 @@ MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); static u8 macaddr_buf[ETH_ALEN]; -static struct vdpasim *vdpasim_net_dev; - static void vdpasim_net_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); @@ -120,21 +118,23 @@ static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) memcpy(net_config->mac, macaddr_buf, ETH_ALEN); } -static int __init vdpasim_net_init(void) +static void vdpasim_net_mgmtdev_release(struct device *dev) +{ +} + +static struct device vdpasim_net_mgmtdev = { + .init_name = "vdpasim_net", + .release = vdpasim_net_mgmtdev_release, +}; + +static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name) { struct vdpasim_dev_attr dev_attr = {}; + struct vdpasim *simdev; int ret; - if (macaddr) { - mac_pton(macaddr, macaddr_buf); - if (!is_valid_ether_addr(macaddr_buf)) { - ret = -EADDRNOTAVAIL; - goto out; - } - } else { - eth_random_addr(macaddr_buf); - } - + dev_attr.mgmt_dev = mdev; + dev_attr.name = name; dev_attr.id = VIRTIO_ID_NET; dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_NET_VQ_NUM; @@ -143,29 +143,75 @@ static int __init vdpasim_net_init(void) dev_attr.work_fn = vdpasim_net_work; dev_attr.buffer_size = PAGE_SIZE; - vdpasim_net_dev = vdpasim_create(&dev_attr); - if (IS_ERR(vdpasim_net_dev)) { - ret = PTR_ERR(vdpasim_net_dev); - goto out; - } + simdev = vdpasim_create(&dev_attr); + if (IS_ERR(simdev)) + return PTR_ERR(simdev); - ret = vdpa_register_device(&vdpasim_net_dev->vdpa); + ret = _vdpa_register_device(&simdev->vdpa); if (ret) - goto put_dev; + goto reg_err; return 0; -put_dev: - put_device(&vdpasim_net_dev->vdpa.dev); -out: +reg_err: + put_device(&simdev->vdpa.dev); + return ret; +} + +static void vdpasim_net_dev_del(struct vdpa_mgmt_dev *mdev, + struct vdpa_device *dev) +{ + struct vdpasim *simdev = container_of(dev, struct vdpasim, vdpa); + + _vdpa_unregister_device(&simdev->vdpa); +} + +static const struct vdpa_mgmtdev_ops vdpasim_net_mgmtdev_ops = { + .dev_add = vdpasim_net_dev_add, + .dev_del = vdpasim_net_dev_del +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct vdpa_mgmt_dev mgmt_dev = { + .device = &vdpasim_net_mgmtdev, + .id_table = id_table, + .ops = &vdpasim_net_mgmtdev_ops, +}; + +static int __init vdpasim_net_init(void) +{ + int ret; + + if (macaddr) { + mac_pton(macaddr, macaddr_buf); + if (!is_valid_ether_addr(macaddr_buf)) + return -EADDRNOTAVAIL; + } else { + eth_random_addr(macaddr_buf); + } + + ret = device_register(&vdpasim_net_mgmtdev); + if (ret) + return ret; + + ret = vdpa_mgmtdev_register(&mgmt_dev); + if (ret) + goto parent_err; + return 0; + +parent_err: + device_unregister(&vdpasim_net_mgmtdev); return ret; } static void __exit vdpasim_net_exit(void) { - struct vdpa_device *vdpa = &vdpasim_net_dev->vdpa; - - vdpa_unregister_device(vdpa); + vdpa_mgmtdev_unregister(&mgmt_dev); + device_unregister(&vdpasim_net_mgmtdev); } module_init(vdpasim_net_init); From 489084dd3f7e4bd649814bd62839aef4456659e8 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Sat, 23 Jan 2021 00:08:53 -0800 Subject: [PATCH 291/506] vhost scsi: alloc vhost_scsi with kvzalloc() to avoid delay The size of 'struct vhost_scsi' is order-10 (~2.3MB). It may take long time delay by kzalloc() to compact memory pages by retrying multiple times when there is a lack of high-order pages. As a result, there is latency to create a VM (with vhost-scsi) or to hotadd vhost-scsi-based storage. The prior commit 595cb754983d ("vhost/scsi: use vmalloc for order-10 allocation") prefers to fallback only when really needed, while this patch allocates with kvzalloc() with __GFP_NORETRY implicitly set to avoid retrying memory pages compact for multiple times. The __GFP_NORETRY is implicitly set if the size to allocate is more than PAGE_SZIE and when __GFP_RETRY_MAYFAIL is not explicitly set. Cc: Aruna Ramakrishna Cc: Joe Jin Signed-off-by: Dongli Zhang Link: https://lore.kernel.org/r/20210123080853.4214-1-dongli.zhang@oracle.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/scsi.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 4ce9f00ae10e..5de21ad4bd05 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1814,12 +1814,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) struct vhost_virtqueue **vqs; int r = -ENOMEM, i; - vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_RETRY_MAYFAIL); - if (!vs) { - vs = vzalloc(sizeof(*vs)); - if (!vs) - goto err_vs; - } + vs = kvzalloc(sizeof(*vs), GFP_KERNEL); + if (!vs) + goto err_vs; vqs = kmalloc_array(VHOST_SCSI_MAX_VQ, sizeof(*vqs), GFP_KERNEL); if (!vqs) From 64f2087aaa2c2a504f637736f48e71da0cd4afe0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:45 +0800 Subject: [PATCH 292/506] virtio-pci: do not access iomem via struct virtio_pci_device directly Instead of accessing iomem via struct virito_pci_device directly, tweak to call the io accessors through the iomem structure. This will ease the splitting of modern virtio device logic. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 76 ++++++++++++++++++------------ 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 3d6ae5a5e252..df1481fd400c 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -141,12 +141,13 @@ static void __iomem *map_capability(struct pci_dev *dev, int off, static u64 vp_get_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; u64 features; - vp_iowrite32(0, &vp_dev->common->device_feature_select); - features = vp_ioread32(&vp_dev->common->device_feature); - vp_iowrite32(1, &vp_dev->common->device_feature_select); - features |= ((u64)vp_ioread32(&vp_dev->common->device_feature) << 32); + vp_iowrite32(0, &cfg->device_feature_select); + features = vp_ioread32(&cfg->device_feature); + vp_iowrite32(1, &cfg->device_feature_select); + features |= ((u64)vp_ioread32(&cfg->device_feature) << 32); return features; } @@ -165,6 +166,7 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) static int vp_finalize_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; u64 features = vdev->features; /* Give virtio_ring a chance to accept features. */ @@ -179,10 +181,10 @@ static int vp_finalize_features(struct virtio_device *vdev) return -EINVAL; } - vp_iowrite32(0, &vp_dev->common->guest_feature_select); - vp_iowrite32((u32)vdev->features, &vp_dev->common->guest_feature); - vp_iowrite32(1, &vp_dev->common->guest_feature_select); - vp_iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature); + vp_iowrite32(0, &cfg->guest_feature_select); + vp_iowrite32((u32)vdev->features, &cfg->guest_feature); + vp_iowrite32(1, &cfg->guest_feature_select); + vp_iowrite32(vdev->features >> 32, &cfg->guest_feature); return 0; } @@ -192,6 +194,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *device = vp_dev->device; u8 b; __le16 w; __le32 l; @@ -200,21 +203,21 @@ static void vp_get(struct virtio_device *vdev, unsigned offset, switch (len) { case 1: - b = ioread8(vp_dev->device + offset); + b = ioread8(device + offset); memcpy(buf, &b, sizeof b); break; case 2: - w = cpu_to_le16(ioread16(vp_dev->device + offset)); + w = cpu_to_le16(ioread16(device + offset)); memcpy(buf, &w, sizeof w); break; case 4: - l = cpu_to_le32(ioread32(vp_dev->device + offset)); + l = cpu_to_le32(ioread32(device + offset)); memcpy(buf, &l, sizeof l); break; case 8: - l = cpu_to_le32(ioread32(vp_dev->device + offset)); + l = cpu_to_le32(ioread32(device + offset)); memcpy(buf, &l, sizeof l); - l = cpu_to_le32(ioread32(vp_dev->device + offset + sizeof l)); + l = cpu_to_le32(ioread32(device + offset + sizeof l)); memcpy(buf + sizeof l, &l, sizeof l); break; default: @@ -228,6 +231,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *device = vp_dev->device; u8 b; __le16 w; __le32 l; @@ -237,21 +241,21 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, switch (len) { case 1: memcpy(&b, buf, sizeof b); - iowrite8(b, vp_dev->device + offset); + iowrite8(b, device + offset); break; case 2: memcpy(&w, buf, sizeof w); - iowrite16(le16_to_cpu(w), vp_dev->device + offset); + iowrite16(le16_to_cpu(w), device + offset); break; case 4: memcpy(&l, buf, sizeof l); - iowrite32(le32_to_cpu(l), vp_dev->device + offset); + iowrite32(le32_to_cpu(l), device + offset); break; case 8: memcpy(&l, buf, sizeof l); - iowrite32(le32_to_cpu(l), vp_dev->device + offset); + iowrite32(le32_to_cpu(l), device + offset); memcpy(&l, buf + sizeof l, sizeof l); - iowrite32(le32_to_cpu(l), vp_dev->device + offset + sizeof l); + iowrite32(le32_to_cpu(l), device + offset + sizeof l); break; default: BUG(); @@ -261,35 +265,43 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, static u32 vp_generation(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - return vp_ioread8(&vp_dev->common->config_generation); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + + return vp_ioread8(&cfg->config_generation); } /* config->{get,set}_status() implementations */ static u8 vp_get_status(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - return vp_ioread8(&vp_dev->common->device_status); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + + return vp_ioread8(&cfg->device_status); } static void vp_set_status(struct virtio_device *vdev, u8 status) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + /* We should never be setting status to 0. */ BUG_ON(status == 0); - vp_iowrite8(status, &vp_dev->common->device_status); + vp_iowrite8(status, &cfg->device_status); } static void vp_reset(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + /* 0 status means a reset. */ - vp_iowrite8(0, &vp_dev->common->device_status); + vp_iowrite8(0, &cfg->device_status); /* After writing 0 to device_status, the driver MUST wait for a read of * device_status to return 0 before reinitializing the device. * This will flush out the status write, and flush in device writes, * including MSI-X interrupts, if any. */ - while (vp_ioread8(&vp_dev->common->device_status)) + while (vp_ioread8(&cfg->device_status)) msleep(1); /* Flush pending VQ/configuration callbacks. */ vp_synchronize_vectors(vdev); @@ -297,11 +309,13 @@ static void vp_reset(struct virtio_device *vdev) static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) { + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + /* Setup the vector used for configuration events */ - vp_iowrite16(vector, &vp_dev->common->msix_config); + vp_iowrite16(vector, &cfg->msix_config); /* Verify we had enough resources to assign the vector */ /* Will also flush the write out to device */ - return vp_ioread16(&vp_dev->common->msix_config); + return vp_ioread16(&cfg->msix_config); } static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, @@ -407,6 +421,7 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; struct virtqueue *vq; int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); @@ -417,8 +432,8 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, * this, there's no way to go back except reset. */ list_for_each_entry(vq, &vdev->vqs, list) { - vp_iowrite16(vq->index, &vp_dev->common->queue_select); - vp_iowrite16(1, &vp_dev->common->queue_enable); + vp_iowrite16(vq->index, &cfg->queue_select); + vp_iowrite16(1, &cfg->queue_enable); } return 0; @@ -428,14 +443,15 @@ static void del_vq(struct virtio_pci_vq_info *info) { struct virtqueue *vq = info->vq; struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; - vp_iowrite16(vq->index, &vp_dev->common->queue_select); + vp_iowrite16(vq->index, &cfg->queue_select); if (vp_dev->msix_enabled) { vp_iowrite16(VIRTIO_MSI_NO_VECTOR, - &vp_dev->common->queue_msix_vector); + &cfg->queue_msix_vector); /* Flush the write out to device */ - vp_ioread16(&vp_dev->common->queue_msix_vector); + vp_ioread16(&cfg->queue_msix_vector); } if (!vp_dev->notify_base) From b5d58094508724970ed9b68cdeca01e8f5333e0e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:46 +0800 Subject: [PATCH 293/506] virtio-pci: split out modern device This patch splits out the virtio-pci modern device only attributes into another structure. While at it, a dedicated probe method for modern only attributes is introduced. This may help for split the logic into a dedicated module. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-3-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.h | 25 +++-- drivers/virtio/virtio_pci_modern.c | 157 ++++++++++++++++------------- 2 files changed, 104 insertions(+), 78 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index b2f0eb4067cb..f35ff5b6b467 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -39,22 +39,16 @@ struct virtio_pci_vq_info { unsigned msix_vector; }; -/* Our device structure */ -struct virtio_pci_device { - struct virtio_device vdev; +struct virtio_pci_modern_device { struct pci_dev *pci_dev; - /* In legacy mode, these two point to within ->legacy. */ - /* Where to read and clear interrupt */ - u8 __iomem *isr; - - /* Modern only fields */ - /* The IO mapping for the PCI config space (non-legacy mode) */ struct virtio_pci_common_cfg __iomem *common; /* Device-specific data (non-legacy mode) */ void __iomem *device; /* Base of vq notifications (non-legacy mode). */ void __iomem *notify_base; + /* Where to read and clear interrupt */ + u8 __iomem *isr; /* So we can sanity-check accesses. */ size_t notify_len; @@ -68,6 +62,19 @@ struct virtio_pci_device { int modern_bars; + struct virtio_device_id id; +}; + +/* Our device structure */ +struct virtio_pci_device { + struct virtio_device vdev; + struct pci_dev *pci_dev; + struct virtio_pci_modern_device mdev; + + /* In legacy mode, these two point to within ->legacy. */ + /* Where to read and clear interrupt */ + u8 __iomem *isr; + /* Legacy only field */ /* the IO mapping for the PCI config space */ void __iomem *ioaddr; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index df1481fd400c..524490a94ca4 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -141,7 +141,8 @@ static void __iomem *map_capability(struct pci_dev *dev, int off, static u64 vp_get_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; u64 features; vp_iowrite32(0, &cfg->device_feature_select); @@ -166,7 +167,8 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) static int vp_finalize_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; u64 features = vdev->features; /* Give virtio_ring a chance to accept features. */ @@ -194,12 +196,13 @@ static void vp_get(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - void __iomem *device = vp_dev->device; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + void __iomem *device = mdev->device; u8 b; __le16 w; __le32 l; - BUG_ON(offset + len > vp_dev->device_len); + BUG_ON(offset + len > mdev->device_len); switch (len) { case 1: @@ -231,12 +234,13 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - void __iomem *device = vp_dev->device; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + void __iomem *device = mdev->device; u8 b; __le16 w; __le32 l; - BUG_ON(offset + len > vp_dev->device_len); + BUG_ON(offset + len > mdev->device_len); switch (len) { case 1: @@ -265,7 +269,8 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, static u32 vp_generation(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; return vp_ioread8(&cfg->config_generation); } @@ -274,7 +279,8 @@ static u32 vp_generation(struct virtio_device *vdev) static u8 vp_get_status(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; return vp_ioread8(&cfg->device_status); } @@ -282,7 +288,8 @@ static u8 vp_get_status(struct virtio_device *vdev) static void vp_set_status(struct virtio_device *vdev, u8 status) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; /* We should never be setting status to 0. */ BUG_ON(status == 0); @@ -292,7 +299,8 @@ static void vp_set_status(struct virtio_device *vdev, u8 status) static void vp_reset(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; /* 0 status means a reset. */ vp_iowrite8(0, &cfg->device_status); @@ -309,7 +317,8 @@ static void vp_reset(struct virtio_device *vdev) static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) { - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; /* Setup the vector used for configuration events */ vp_iowrite16(vector, &cfg->msix_config); @@ -326,7 +335,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, bool ctx, u16 msix_vec) { - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; struct virtqueue *vq; u16 num, off; int err; @@ -369,25 +380,25 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, vp_iowrite64_twopart(virtqueue_get_used_addr(vq), &cfg->queue_used_lo, &cfg->queue_used_hi); - if (vp_dev->notify_base) { + if (mdev->notify_base) { /* offset should not wrap */ - if ((u64)off * vp_dev->notify_offset_multiplier + 2 - > vp_dev->notify_len) { - dev_warn(&vp_dev->pci_dev->dev, + if ((u64)off * mdev->notify_offset_multiplier + 2 + > mdev->notify_len) { + dev_warn(&mdev->pci_dev->dev, "bad notification offset %u (x %u) " "for queue %u > %zd", - off, vp_dev->notify_offset_multiplier, - index, vp_dev->notify_len); + off, mdev->notify_offset_multiplier, + index, mdev->notify_len); err = -EINVAL; goto err_map_notify; } - vq->priv = (void __force *)vp_dev->notify_base + - off * vp_dev->notify_offset_multiplier; + vq->priv = (void __force *)mdev->notify_base + + off * mdev->notify_offset_multiplier; } else { - vq->priv = (void __force *)map_capability(vp_dev->pci_dev, - vp_dev->notify_map_cap, 2, 2, - off * vp_dev->notify_offset_multiplier, 2, - NULL); + vq->priv = (void __force *)map_capability(mdev->pci_dev, + mdev->notify_map_cap, 2, 2, + off * mdev->notify_offset_multiplier, 2, + NULL); } if (!vq->priv) { @@ -407,8 +418,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, return vq; err_assign_vector: - if (!vp_dev->notify_base) - pci_iounmap(vp_dev->pci_dev, (void __iomem __force *)vq->priv); + if (!mdev->notify_base) + pci_iounmap(mdev->pci_dev, (void __iomem __force *)vq->priv); err_map_notify: vring_del_virtqueue(vq); return ERR_PTR(err); @@ -421,7 +432,7 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->mdev.common; struct virtqueue *vq; int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); @@ -443,7 +454,9 @@ static void del_vq(struct virtio_pci_vq_info *info) { struct virtqueue *vq = info->vq; struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + vp_iowrite16(vq->index, &cfg->queue_select); @@ -454,8 +467,8 @@ static void del_vq(struct virtio_pci_vq_info *info) vp_ioread16(&cfg->queue_msix_vector); } - if (!vp_dev->notify_base) - pci_iounmap(vp_dev->pci_dev, (void __force __iomem *)vq->priv); + if (!mdev->notify_base) + pci_iounmap(mdev->pci_dev, (void __force __iomem *)vq->priv); vring_del_virtqueue(vq); } @@ -693,6 +706,7 @@ static inline void check_offsets(void) /* the PCI probing function */ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) { + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; struct pci_dev *pci_dev = vp_dev->pci_dev; int err, common, isr, notify, device; u32 notify_length; @@ -700,6 +714,8 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) check_offsets(); + mdev->pci_dev = pci_dev; + /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) return -ENODEV; @@ -708,17 +724,17 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) /* Transitional devices: use the PCI subsystem device id as * virtio device id, same as legacy driver always did. */ - vp_dev->vdev.id.device = pci_dev->subsystem_device; + mdev->id.device = pci_dev->subsystem_device; } else { /* Modern devices: simply use PCI device id, but start from 0x1040. */ - vp_dev->vdev.id.device = pci_dev->device - 0x1040; + mdev->id.device = pci_dev->device - 0x1040; } - vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; + mdev->id.vendor = pci_dev->subsystem_vendor; /* check for a common config: if not, use legacy mode (bar 0). */ common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG, IORESOURCE_IO | IORESOURCE_MEM, - &vp_dev->modern_bars); + &mdev->modern_bars); if (!common) { dev_info(&pci_dev->dev, "virtio_pci: leaving for legacy driver\n"); @@ -728,10 +744,10 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) /* If common is there, these should be too... */ isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG, IORESOURCE_IO | IORESOURCE_MEM, - &vp_dev->modern_bars); + &mdev->modern_bars); notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG, IORESOURCE_IO | IORESOURCE_MEM, - &vp_dev->modern_bars); + &mdev->modern_bars); if (!isr || !notify) { dev_err(&pci_dev->dev, "virtio_pci: missing capabilities %i/%i/%i\n", @@ -751,31 +767,31 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) */ device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG, IORESOURCE_IO | IORESOURCE_MEM, - &vp_dev->modern_bars); + &mdev->modern_bars); - err = pci_request_selected_regions(pci_dev, vp_dev->modern_bars, + err = pci_request_selected_regions(pci_dev, mdev->modern_bars, "virtio-pci-modern"); if (err) return err; err = -EINVAL; - vp_dev->common = map_capability(pci_dev, common, - sizeof(struct virtio_pci_common_cfg), 4, - 0, sizeof(struct virtio_pci_common_cfg), - NULL); - if (!vp_dev->common) + mdev->common = map_capability(pci_dev, common, + sizeof(struct virtio_pci_common_cfg), 4, + 0, sizeof(struct virtio_pci_common_cfg), + NULL); + if (!mdev->common) goto err_map_common; - vp_dev->isr = map_capability(pci_dev, isr, sizeof(u8), 1, - 0, 1, - NULL); - if (!vp_dev->isr) + mdev->isr = map_capability(pci_dev, isr, sizeof(u8), 1, + 0, 1, + NULL); + if (!mdev->isr) goto err_map_isr; /* Read notify_off_multiplier from config space. */ pci_read_config_dword(pci_dev, notify + offsetof(struct virtio_pci_notify_cap, notify_off_multiplier), - &vp_dev->notify_offset_multiplier); + &mdev->notify_offset_multiplier); /* Read notify length and offset from config space. */ pci_read_config_dword(pci_dev, notify + offsetof(struct virtio_pci_notify_cap, @@ -792,23 +808,23 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) * Otherwise, map each VQ individually later. */ if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) { - vp_dev->notify_base = map_capability(pci_dev, notify, 2, 2, - 0, notify_length, - &vp_dev->notify_len); - if (!vp_dev->notify_base) + mdev->notify_base = map_capability(pci_dev, notify, 2, 2, + 0, notify_length, + &mdev->notify_len); + if (!mdev->notify_base) goto err_map_notify; } else { - vp_dev->notify_map_cap = notify; + mdev->notify_map_cap = notify; } /* Again, we don't know how much we should map, but PAGE_SIZE * is more than enough for all existing devices. */ if (device) { - vp_dev->device = map_capability(pci_dev, device, 0, 4, - 0, PAGE_SIZE, - &vp_dev->device_len); - if (!vp_dev->device) + mdev->device = map_capability(pci_dev, device, 0, 4, + 0, PAGE_SIZE, + &mdev->device_len); + if (!mdev->device) goto err_map_device; vp_dev->vdev.config = &virtio_pci_config_ops; @@ -819,29 +835,32 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->config_vector = vp_config_vector; vp_dev->setup_vq = setup_vq; vp_dev->del_vq = del_vq; + vp_dev->isr = mdev->isr; + vp_dev->vdev.id = mdev->id; return 0; err_map_device: - if (vp_dev->notify_base) - pci_iounmap(pci_dev, vp_dev->notify_base); + if (mdev->notify_base) + pci_iounmap(pci_dev, mdev->notify_base); err_map_notify: - pci_iounmap(pci_dev, vp_dev->isr); + pci_iounmap(pci_dev, mdev->isr); err_map_isr: - pci_iounmap(pci_dev, vp_dev->common); + pci_iounmap(pci_dev, mdev->common); err_map_common: return err; } void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) { - struct pci_dev *pci_dev = vp_dev->pci_dev; + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct pci_dev *pci_dev = mdev->pci_dev; - if (vp_dev->device) - pci_iounmap(pci_dev, vp_dev->device); - if (vp_dev->notify_base) - pci_iounmap(pci_dev, vp_dev->notify_base); - pci_iounmap(pci_dev, vp_dev->isr); - pci_iounmap(pci_dev, vp_dev->common); - pci_release_selected_regions(pci_dev, vp_dev->modern_bars); + if (mdev->device) + pci_iounmap(pci_dev, mdev->device); + if (mdev->notify_base) + pci_iounmap(pci_dev, mdev->notify_base); + pci_iounmap(pci_dev, mdev->isr); + pci_iounmap(pci_dev, mdev->common); + pci_release_selected_regions(pci_dev, mdev->modern_bars); } From 117a9de2826ccb0d338afb03f07223b3cb789371 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:47 +0800 Subject: [PATCH 294/506] virtio-pci-modern: factor out modern device initialization logic This patch factors out the modern device initialization logic into a helper. Note that it still depends on the caller to enable pci device which allows the caller to use e.g devres. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-4-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 50 +++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 524490a94ca4..5d2d2ae0dfdb 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -703,11 +703,16 @@ static inline void check_offsets(void) offsetof(struct virtio_pci_common_cfg, queue_used_hi)); } -/* the PCI probing function */ -int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) +/* + * vp_modern_probe: probe the modern virtio pci device, note that the + * caller is required to enable PCI device before calling this function. + * @mdev: the modern virtio-pci device + * + * Return 0 on succeed otherwise fail + */ +static int vp_modern_probe(struct virtio_pci_modern_device *mdev) { - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - struct pci_dev *pci_dev = vp_dev->pci_dev; + struct pci_dev *pci_dev = mdev->pci_dev; int err, common, isr, notify, device; u32 notify_length; u32 notify_offset; @@ -826,18 +831,8 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) &mdev->device_len); if (!mdev->device) goto err_map_device; - - vp_dev->vdev.config = &virtio_pci_config_ops; - } else { - vp_dev->vdev.config = &virtio_pci_config_nodev_ops; } - vp_dev->config_vector = vp_config_vector; - vp_dev->setup_vq = setup_vq; - vp_dev->del_vq = del_vq; - vp_dev->isr = mdev->isr; - vp_dev->vdev.id = mdev->id; - return 0; err_map_device: @@ -851,6 +846,33 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) return err; } +/* the PCI probing function */ +int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct pci_dev *pci_dev = vp_dev->pci_dev; + int err; + + mdev->pci_dev = pci_dev; + + err = vp_modern_probe(mdev); + if (err) + return err; + + if (mdev->device) + vp_dev->vdev.config = &virtio_pci_config_ops; + else + vp_dev->vdev.config = &virtio_pci_config_nodev_ops; + + vp_dev->config_vector = vp_config_vector; + vp_dev->setup_vq = setup_vq; + vp_dev->del_vq = del_vq; + vp_dev->isr = mdev->isr; + vp_dev->vdev.id = mdev->id; + + return 0; +} + void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; From 32490370883822e9e8dd7e4410bbe22bb3b77b58 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:48 +0800 Subject: [PATCH 295/506] virtio-pci-modern: introduce vp_modern_remove() This patch introduces vp_modern_remove() doing device resources cleanup to make it can be used. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-5-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 5d2d2ae0dfdb..4be9afad547e 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -873,9 +873,12 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) return 0; } -void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) +/* + * vp_modern_probe: remove and cleanup the modern virtio pci device + * @mdev: the modern virtio-pci device + */ +static void vp_modern_remove(struct virtio_pci_modern_device *mdev) { - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; struct pci_dev *pci_dev = mdev->pci_dev; if (mdev->device) @@ -886,3 +889,10 @@ void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) pci_iounmap(pci_dev, mdev->common); pci_release_selected_regions(pci_dev, mdev->modern_bars); } + +void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + + vp_modern_remove(mdev); +} From 1a5c85f16594416df258fe38ec05e0dd05846479 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:49 +0800 Subject: [PATCH 296/506] virtio-pci-modern: introduce helper to set config vector This patch introduces vp_modern_config_vector() for setting config vector. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-6-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 4be9afad547e..2e37bfc89655 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -315,9 +315,16 @@ static void vp_reset(struct virtio_device *vdev) vp_synchronize_vectors(vdev); } -static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) +/* + * vp_modern_config_vector - set the vector for config interrupt + * @mdev: the modern virtio-pci device + * @vector: the config vector + * + * Returns the config vector read from the device + */ +static u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev, + u16 vector) { - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; struct virtio_pci_common_cfg __iomem *cfg = mdev->common; /* Setup the vector used for configuration events */ @@ -327,6 +334,11 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) return vp_ioread16(&cfg->msix_config); } +static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) +{ + return vp_modern_config_vector(&vp_dev->mdev, vector); +} + static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, struct virtio_pci_vq_info *info, unsigned index, From e3669129fdcd2494e6991bd4111e090551087e16 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:50 +0800 Subject: [PATCH 297/506] virtio-pci-modern: introduce helpers for setting and getting status This patch introduces helpers to allow set and get device status. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-7-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 41 ++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 2e37bfc89655..ccde0a41209a 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -275,41 +275,62 @@ static u32 vp_generation(struct virtio_device *vdev) return vp_ioread8(&cfg->config_generation); } -/* config->{get,set}_status() implementations */ -static u8 vp_get_status(struct virtio_device *vdev) +/* + * vp_modern_get_status - get the device status + * @mdev: the modern virtio-pci device + * + * Returns the status read from device + */ +static u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev) { - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; struct virtio_pci_common_cfg __iomem *cfg = mdev->common; return vp_ioread8(&cfg->device_status); } +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return vp_modern_get_status(&vp_dev->mdev); +} + +/* + * vp_modern_set_status - set status to device + * @mdev: the modern virtio-pci device + * @status: the status set to device + */ +static void vp_modern_set_status(struct virtio_pci_modern_device *mdev, + u8 status) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite8(status, &cfg->device_status); +} + static void vp_set_status(struct virtio_device *vdev, u8 status) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; /* We should never be setting status to 0. */ BUG_ON(status == 0); - vp_iowrite8(status, &cfg->device_status); + vp_modern_set_status(&vp_dev->mdev, status); } static void vp_reset(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; /* 0 status means a reset. */ - vp_iowrite8(0, &cfg->device_status); + vp_modern_set_status(mdev, 0); /* After writing 0 to device_status, the driver MUST wait for a read of * device_status to return 0 before reinitializing the device. * This will flush out the status write, and flush in device writes, * including MSI-X interrupts, if any. */ - while (vp_ioread8(&cfg->device_status)) + while (vp_modern_get_status(mdev)) msleep(1); /* Flush pending VQ/configuration callbacks. */ vp_synchronize_vectors(vdev); From 0b0177089c60236b6a785346cfaab081acd9be26 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:51 +0800 Subject: [PATCH 298/506] virtio-pci-modern: introduce helpers for setting and getting features This patch introduces helpers for setting and getting features. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-8-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 43 +++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index ccde0a41209a..cb14fc334a9c 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -137,12 +137,16 @@ static void __iomem *map_capability(struct pci_dev *dev, int off, return p; } -/* virtio config->get_features() implementation */ -static u64 vp_get_features(struct virtio_device *vdev) +/* + * vp_modern_get_features - get features from device + * @mdev: the modern virtio-pci device + * + * Returns the features read from the device + */ +static u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) { - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + u64 features; vp_iowrite32(0, &cfg->device_feature_select); @@ -153,6 +157,14 @@ static u64 vp_get_features(struct virtio_device *vdev) return features; } +/* virtio config->get_features() implementation */ +static u64 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return vp_modern_get_features(&vp_dev->mdev); +} + static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -163,12 +175,26 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) __virtio_set_bit(vdev, VIRTIO_F_SR_IOV); } +/* + * vp_modern_set_features - set features to device + * @mdev: the modern virtio-pci device + * @features: the features set to device + */ +static void vp_modern_set_features(struct virtio_pci_modern_device *mdev, + u64 features) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite32(0, &cfg->guest_feature_select); + vp_iowrite32((u32)features, &cfg->guest_feature); + vp_iowrite32(1, &cfg->guest_feature_select); + vp_iowrite32(features >> 32, &cfg->guest_feature); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; u64 features = vdev->features; /* Give virtio_ring a chance to accept features. */ @@ -183,10 +209,7 @@ static int vp_finalize_features(struct virtio_device *vdev) return -EINVAL; } - vp_iowrite32(0, &cfg->guest_feature_select); - vp_iowrite32((u32)vdev->features, &cfg->guest_feature); - vp_iowrite32(1, &cfg->guest_feature_select); - vp_iowrite32(vdev->features >> 32, &cfg->guest_feature); + vp_modern_set_features(&vp_dev->mdev, vdev->features); return 0; } From ed2a73dbab138b1af8501d48b4f57fa9be68f43c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:52 +0800 Subject: [PATCH 299/506] virtio-pci-modern: introduce vp_modern_generation() This patch introduces vp_modern_generation() to get device generation. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-9-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index cb14fc334a9c..a128e5814045 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -289,15 +289,26 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, } } -static u32 vp_generation(struct virtio_device *vdev) +/* + * vp_modern_generation - get the device genreation + * @mdev: the modern virtio-pci device + * + * Returns the genreation read from device + */ +static u32 vp_modern_generation(struct virtio_pci_modern_device *mdev) { - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_modern_device *mdev = &vp_dev->mdev; struct virtio_pci_common_cfg __iomem *cfg = mdev->common; return vp_ioread8(&cfg->config_generation); } +static u32 vp_generation(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return vp_modern_generation(&vp_dev->mdev); +} + /* * vp_modern_get_status - get the device status * @mdev: the modern virtio-pci device From 3fbda9c1a67522bba5c40e4710c1fa6ab6712d73 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:53 +0800 Subject: [PATCH 300/506] virtio-pci-modern: introduce vp_modern_set_queue_vector() This patch introduces a helper to set virtqueue MSI vector. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-10-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 35 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index a128e5814045..05cd409c0731 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -191,6 +191,25 @@ static void vp_modern_set_features(struct virtio_pci_modern_device *mdev, vp_iowrite32(features >> 32, &cfg->guest_feature); } +/* + * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue + * @mdev: the modern virtio-pci device + * @index: queue index + * @vector: the config vector + * + * Returns the config vector read from the device + */ +static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, + u16 index, u16 vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, &cfg->queue_select); + vp_iowrite16(vector, &cfg->queue_msix_vector); + /* Flush the write out to device */ + return vp_ioread16(&cfg->queue_msix_vector); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -474,8 +493,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } if (msix_vec != VIRTIO_MSI_NO_VECTOR) { - vp_iowrite16(msix_vec, &cfg->queue_msix_vector); - msix_vec = vp_ioread16(&cfg->queue_msix_vector); + msix_vec = vp_modern_queue_vector(mdev, index, msix_vec); if (msix_vec == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto err_assign_vector; @@ -522,17 +540,10 @@ static void del_vq(struct virtio_pci_vq_info *info) struct virtqueue *vq = info->vq; struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - vp_iowrite16(vq->index, &cfg->queue_select); - - if (vp_dev->msix_enabled) { - vp_iowrite16(VIRTIO_MSI_NO_VECTOR, - &cfg->queue_msix_vector); - /* Flush the write out to device */ - vp_ioread16(&cfg->queue_msix_vector); - } + if (vp_dev->msix_enabled) + vp_modern_queue_vector(mdev, vq->index, + VIRTIO_MSI_NO_VECTOR); if (!mdev->notify_base) pci_iounmap(mdev->pci_dev, (void __force __iomem *)vq->priv); From e1b0fa2e386df72bc92e3ea03759e851fb533c97 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:54 +0800 Subject: [PATCH 301/506] virtio-pci-modern: introduce vp_modern_queue_address() This patch introduce a helper to set virtqueue address for modern address. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-11-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 33 ++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 05cd409c0731..05b21e18f46c 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -210,6 +210,30 @@ static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, return vp_ioread16(&cfg->queue_msix_vector); } +/* + * vp_modern_queue_address - set the virtqueue address + * @mdev: the modern virtio-pci device + * @index: the queue index + * @desc_addr: address of the descriptor area + * @driver_addr: address of the driver area + * @device_addr: address of the device area + */ +static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, + u16 index, u64 desc_addr, u64 driver_addr, + u64 device_addr) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, &cfg->queue_select); + + vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo, + &cfg->queue_desc_hi); + vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo, + &cfg->queue_avail_hi); + vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo, + &cfg->queue_used_hi); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -459,12 +483,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* activate the queue */ vp_iowrite16(virtqueue_get_vring_size(vq), &cfg->queue_size); - vp_iowrite64_twopart(virtqueue_get_desc_addr(vq), - &cfg->queue_desc_lo, &cfg->queue_desc_hi); - vp_iowrite64_twopart(virtqueue_get_avail_addr(vq), - &cfg->queue_avail_lo, &cfg->queue_avail_hi); - vp_iowrite64_twopart(virtqueue_get_used_addr(vq), - &cfg->queue_used_lo, &cfg->queue_used_hi); + vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), + virtqueue_get_avail_addr(vq), + virtqueue_get_used_addr(vq)); if (mdev->notify_base) { /* offset should not wrap */ From dc2e64819837ba927f2811d8ac95a027d931764d Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:55 +0800 Subject: [PATCH 302/506] virtio-pci-modern: introduce helper to set/get queue_enable This patch introduces a helper to set/get queue_enable for modern device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-12-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 37 +++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 05b21e18f46c..0e62820b83ff 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -234,6 +234,34 @@ static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, &cfg->queue_used_hi); } +/* + * vp_modern_set_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @enable: whether the virtqueue is enable or not + */ +static void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index, bool enable) +{ + vp_iowrite16(index, &mdev->common->queue_select); + vp_iowrite16(enable, &mdev->common->queue_enable); +} + +/* + * vp_modern_get_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns whether a virtqueue is enabled or not + */ +static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_enable); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -460,7 +488,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* Check if queue is either not available or already active. */ num = vp_ioread16(&cfg->queue_size); - if (!num || vp_ioread16(&cfg->queue_enable)) + if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); if (num & (num - 1)) { @@ -538,7 +566,6 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->mdev.common; struct virtqueue *vq; int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); @@ -548,10 +575,8 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, /* Select and activate all queues. Has to be done last: once we do * this, there's no way to go back except reset. */ - list_for_each_entry(vq, &vdev->vqs, list) { - vp_iowrite16(vq->index, &cfg->queue_select); - vp_iowrite16(1, &cfg->queue_enable); - } + list_for_each_entry(vq, &vdev->vqs, list) + vp_modern_set_queue_enable(&vp_dev->mdev, vq->index, true); return 0; } From 75658afbab57706c241ca7d60559ebefd631fc6f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:56 +0800 Subject: [PATCH 303/506] virtio-pci-modern: introduce helper for setting/geting queue size This patch introduces helper for setting/getting queue size for modern device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-13-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 34 ++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 0e62820b83ff..7a89226135af 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -262,6 +262,36 @@ static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, return vp_ioread16(&mdev->common->queue_enable); } +/* + * vp_modern_set_queue_size - set size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @size: the size of the virtqueue + */ +static void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, + u16 index, u16 size) +{ + vp_iowrite16(index, &mdev->common->queue_select); + vp_iowrite16(size, &mdev->common->queue_size); + +} + +/* + * vp_modern_get_queue_size - get size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the size of the virtqueue + */ +static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_size); + +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -487,7 +517,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, vp_iowrite16(index, &cfg->queue_select); /* Check if queue is either not available or already active. */ - num = vp_ioread16(&cfg->queue_size); + num = vp_modern_get_queue_size(mdev, index); if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); @@ -510,7 +540,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, return ERR_PTR(-ENOMEM); /* activate the queue */ - vp_iowrite16(virtqueue_get_vring_size(vq), &cfg->queue_size); + vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq)); vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), virtqueue_get_avail_addr(vq), virtqueue_get_used_addr(vq)); From 6e52fc446d32a82936f05106ffeef5cf8529e6c4 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:57 +0800 Subject: [PATCH 304/506] virtio-pci-modern: introduce helper for getting queue nums This patch introduces helper for getting queue num of modern device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-14-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 7a89226135af..bccad1329871 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -292,6 +292,17 @@ static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, } +/* + * vp_modern_get_num_queues - get the number of virtqueues + * @mdev: the modern virtio-pci device + * + * Returns the number of virtqueues + */ +static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev) +{ + return vp_ioread16(&mdev->common->num_queues); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -510,7 +521,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, u16 num, off; int err; - if (index >= vp_ioread16(&cfg->num_queues)) + if (index >= vp_modern_get_num_queues(mdev)) return ERR_PTR(-ENOENT); /* Select the queue we're interested in */ From 1bfd84134c885799b8ac94766bff600b1f963ecf Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:58 +0800 Subject: [PATCH 305/506] virtio-pci-modern: introduce helper to get notification offset This patch introduces help to get notification offset of modern device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-15-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index bccad1329871..217573f2588d 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -303,6 +303,21 @@ static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev) return vp_ioread16(&mdev->common->num_queues); } +/* + * vp_modern_get_queue_notify_off - get notification offset for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the notification offset for a virtqueue + */ +static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_notify_off); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -516,7 +531,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; struct virtqueue *vq; u16 num, off; int err; @@ -524,9 +538,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (index >= vp_modern_get_num_queues(mdev)) return ERR_PTR(-ENOENT); - /* Select the queue we're interested in */ - vp_iowrite16(index, &cfg->queue_select); - /* Check if queue is either not available or already active. */ num = vp_modern_get_queue_size(mdev, index); if (!num || vp_modern_get_queue_enable(mdev, index)) @@ -538,7 +549,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } /* get offset of notification word for this vq */ - off = vp_ioread16(&cfg->queue_notify_off); + off = vp_modern_get_queue_notify_off(mdev, index); info->msix_vector = msix_vec; From 8000a6b602a4aec0f54a9131623a407d6d44a605 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:54:59 +0800 Subject: [PATCH 306/506] virito-pci-modern: rename map_capability() to vp_modern_map_capability() To ease the split, map_capability() was renamed to vp_modern_map_capability(). While at it, add the comments for the arguments and switch to use virtio_pci_modern_device as the first parameter. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-16-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 46 +++++++++++++++++++----------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 217573f2588d..a5e3a5e40323 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -63,12 +63,25 @@ static void vp_iowrite64_twopart(u64 val, vp_iowrite32(val >> 32, hi); } -static void __iomem *map_capability(struct pci_dev *dev, int off, - size_t minlen, - u32 align, - u32 start, u32 size, - size_t *len) +/* + * vp_modern_map_capability - map a part of virtio pci capability + * @mdev: the modern virtio-pci device + * @off: offset of the capability + * @minlen: minimal length of the capability + * @align: align requirement + * @start: start from the capability + * @size: map size + * @len: the length that is actually mapped + * + * Returns the io address of for the part of the capability + */ +void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, + size_t minlen, + u32 align, + u32 start, u32 size, + size_t *len) { + struct pci_dev *dev = mdev->pci_dev; u8 bar; u32 offset, length; void __iomem *p; @@ -582,7 +595,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, vq->priv = (void __force *)mdev->notify_base + off * mdev->notify_offset_multiplier; } else { - vq->priv = (void __force *)map_capability(mdev->pci_dev, + vq->priv = (void __force *)vp_modern_map_capability(mdev, mdev->notify_map_cap, 2, 2, off * mdev->notify_offset_multiplier, 2, NULL); @@ -956,15 +969,15 @@ static int vp_modern_probe(struct virtio_pci_modern_device *mdev) return err; err = -EINVAL; - mdev->common = map_capability(pci_dev, common, + mdev->common = vp_modern_map_capability(mdev, common, sizeof(struct virtio_pci_common_cfg), 4, 0, sizeof(struct virtio_pci_common_cfg), NULL); if (!mdev->common) goto err_map_common; - mdev->isr = map_capability(pci_dev, isr, sizeof(u8), 1, - 0, 1, - NULL); + mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, + 0, 1, + NULL); if (!mdev->isr) goto err_map_isr; @@ -989,9 +1002,10 @@ static int vp_modern_probe(struct virtio_pci_modern_device *mdev) * Otherwise, map each VQ individually later. */ if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) { - mdev->notify_base = map_capability(pci_dev, notify, 2, 2, - 0, notify_length, - &mdev->notify_len); + mdev->notify_base = vp_modern_map_capability(mdev, notify, + 2, 2, + 0, notify_length, + &mdev->notify_len); if (!mdev->notify_base) goto err_map_notify; } else { @@ -1002,9 +1016,9 @@ static int vp_modern_probe(struct virtio_pci_modern_device *mdev) * is more than enough for all existing devices. */ if (device) { - mdev->device = map_capability(pci_dev, device, 0, 4, - 0, PAGE_SIZE, - &mdev->device_len); + mdev->device = vp_modern_map_capability(mdev, device, 0, 4, + 0, PAGE_SIZE, + &mdev->device_len); if (!mdev->device) goto err_map_device; } From fd502729fbbf6a76fdb7acae4506486bfbb7c4f6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:55:00 +0800 Subject: [PATCH 307/506] virtio-pci: introduce modern device module Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-17-jasowang@redhat.com Including a bugfix: virtio: don't prompt CONFIG_VIRTIO_PCI_MODERN Cc: Arnd Bergmann Cc: Anders Roxell Cc: Guenter Roeck Reported-by: Naresh Kamboju Fixes: 86b87c9d858b6 ("virtio-pci: introduce modern device module") Signed-off-by: Jason Wang Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210223061905.422659-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/Kconfig | 9 + drivers/virtio/Makefile | 1 + drivers/virtio/virtio_pci_common.h | 27 +- drivers/virtio/virtio_pci_modern.c | 617 ------------------------- drivers/virtio/virtio_pci_modern_dev.c | 599 ++++++++++++++++++++++++ include/linux/virtio_pci_modern.h | 111 +++++ 6 files changed, 721 insertions(+), 643 deletions(-) create mode 100644 drivers/virtio/virtio_pci_modern_dev.c create mode 100644 include/linux/virtio_pci_modern.h diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 7b41130d3f35..ce1b3f6ec325 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -12,6 +12,14 @@ config ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS This option is selected if the architecture may need to enforce VIRTIO_F_ACCESS_PLATFORM +config VIRTIO_PCI_LIB + tristate + help + Modern PCI device implementation. This module implements the + basic probe and control for devices which are based on modern + PCI device with possible vendor specific extensions. Any + module that selects this module must depend on PCI. + menuconfig VIRTIO_MENU bool "Virtio drivers" default y @@ -21,6 +29,7 @@ if VIRTIO_MENU config VIRTIO_PCI tristate "PCI driver for virtio devices" depends on PCI + select VIRTIO_PCI_LIB select VIRTIO help This driver provides support for virtio based paravirtual device diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 591e6f72aa54..699bbea0465f 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o +obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index f35ff5b6b467..beec047a8f8d 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -39,32 +40,6 @@ struct virtio_pci_vq_info { unsigned msix_vector; }; -struct virtio_pci_modern_device { - struct pci_dev *pci_dev; - - struct virtio_pci_common_cfg __iomem *common; - /* Device-specific data (non-legacy mode) */ - void __iomem *device; - /* Base of vq notifications (non-legacy mode). */ - void __iomem *notify_base; - /* Where to read and clear interrupt */ - u8 __iomem *isr; - - /* So we can sanity-check accesses. */ - size_t notify_len; - size_t device_len; - - /* Capability for when we need to map notifications per-vq. */ - int notify_map_cap; - - /* Multiply queue_notify_off by this value. (non-legacy mode). */ - u32 notify_offset_multiplier; - - int modern_bars; - - struct virtio_device_id id; -}; - /* Our device structure */ struct virtio_pci_device { struct virtio_device vdev; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index a5e3a5e40323..fbd4ebc00eb6 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -19,158 +19,6 @@ #define VIRTIO_RING_NO_LEGACY #include "virtio_pci_common.h" -/* - * Type-safe wrappers for io accesses. - * Use these to enforce at compile time the following spec requirement: - * - * The driver MUST access each field using the “natural” access - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses - * for 16-bit fields and 8-bit accesses for 8-bit fields. - */ -static inline u8 vp_ioread8(const u8 __iomem *addr) -{ - return ioread8(addr); -} -static inline u16 vp_ioread16 (const __le16 __iomem *addr) -{ - return ioread16(addr); -} - -static inline u32 vp_ioread32(const __le32 __iomem *addr) -{ - return ioread32(addr); -} - -static inline void vp_iowrite8(u8 value, u8 __iomem *addr) -{ - iowrite8(value, addr); -} - -static inline void vp_iowrite16(u16 value, __le16 __iomem *addr) -{ - iowrite16(value, addr); -} - -static inline void vp_iowrite32(u32 value, __le32 __iomem *addr) -{ - iowrite32(value, addr); -} - -static void vp_iowrite64_twopart(u64 val, - __le32 __iomem *lo, __le32 __iomem *hi) -{ - vp_iowrite32((u32)val, lo); - vp_iowrite32(val >> 32, hi); -} - -/* - * vp_modern_map_capability - map a part of virtio pci capability - * @mdev: the modern virtio-pci device - * @off: offset of the capability - * @minlen: minimal length of the capability - * @align: align requirement - * @start: start from the capability - * @size: map size - * @len: the length that is actually mapped - * - * Returns the io address of for the part of the capability - */ -void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, - size_t minlen, - u32 align, - u32 start, u32 size, - size_t *len) -{ - struct pci_dev *dev = mdev->pci_dev; - u8 bar; - u32 offset, length; - void __iomem *p; - - pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap, - bar), - &bar); - pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset), - &offset); - pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length), - &length); - - if (length <= start) { - dev_err(&dev->dev, - "virtio_pci: bad capability len %u (>%u expected)\n", - length, start); - return NULL; - } - - if (length - start < minlen) { - dev_err(&dev->dev, - "virtio_pci: bad capability len %u (>=%zu expected)\n", - length, minlen); - return NULL; - } - - length -= start; - - if (start + offset < offset) { - dev_err(&dev->dev, - "virtio_pci: map wrap-around %u+%u\n", - start, offset); - return NULL; - } - - offset += start; - - if (offset & (align - 1)) { - dev_err(&dev->dev, - "virtio_pci: offset %u not aligned to %u\n", - offset, align); - return NULL; - } - - if (length > size) - length = size; - - if (len) - *len = length; - - if (minlen + offset < minlen || - minlen + offset > pci_resource_len(dev, bar)) { - dev_err(&dev->dev, - "virtio_pci: map virtio %zu@%u " - "out of range on bar %i length %lu\n", - minlen, offset, - bar, (unsigned long)pci_resource_len(dev, bar)); - return NULL; - } - - p = pci_iomap_range(dev, bar, offset, length); - if (!p) - dev_err(&dev->dev, - "virtio_pci: unable to map virtio %u@%u on bar %i\n", - length, offset, bar); - return p; -} - -/* - * vp_modern_get_features - get features from device - * @mdev: the modern virtio-pci device - * - * Returns the features read from the device - */ -static u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - u64 features; - - vp_iowrite32(0, &cfg->device_feature_select); - features = vp_ioread32(&cfg->device_feature); - vp_iowrite32(1, &cfg->device_feature_select); - features |= ((u64)vp_ioread32(&cfg->device_feature) << 32); - - return features; -} - -/* virtio config->get_features() implementation */ static u64 vp_get_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -188,149 +36,6 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) __virtio_set_bit(vdev, VIRTIO_F_SR_IOV); } -/* - * vp_modern_set_features - set features to device - * @mdev: the modern virtio-pci device - * @features: the features set to device - */ -static void vp_modern_set_features(struct virtio_pci_modern_device *mdev, - u64 features) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - vp_iowrite32(0, &cfg->guest_feature_select); - vp_iowrite32((u32)features, &cfg->guest_feature); - vp_iowrite32(1, &cfg->guest_feature_select); - vp_iowrite32(features >> 32, &cfg->guest_feature); -} - -/* - * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue - * @mdev: the modern virtio-pci device - * @index: queue index - * @vector: the config vector - * - * Returns the config vector read from the device - */ -static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, - u16 index, u16 vector) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - vp_iowrite16(index, &cfg->queue_select); - vp_iowrite16(vector, &cfg->queue_msix_vector); - /* Flush the write out to device */ - return vp_ioread16(&cfg->queue_msix_vector); -} - -/* - * vp_modern_queue_address - set the virtqueue address - * @mdev: the modern virtio-pci device - * @index: the queue index - * @desc_addr: address of the descriptor area - * @driver_addr: address of the driver area - * @device_addr: address of the device area - */ -static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, - u16 index, u64 desc_addr, u64 driver_addr, - u64 device_addr) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - vp_iowrite16(index, &cfg->queue_select); - - vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo, - &cfg->queue_desc_hi); - vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo, - &cfg->queue_avail_hi); - vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo, - &cfg->queue_used_hi); -} - -/* - * vp_modern_set_queue_enable - enable a virtqueue - * @mdev: the modern virtio-pci device - * @index: the queue index - * @enable: whether the virtqueue is enable or not - */ -static void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev, - u16 index, bool enable) -{ - vp_iowrite16(index, &mdev->common->queue_select); - vp_iowrite16(enable, &mdev->common->queue_enable); -} - -/* - * vp_modern_get_queue_enable - enable a virtqueue - * @mdev: the modern virtio-pci device - * @index: the queue index - * - * Returns whether a virtqueue is enabled or not - */ -static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, - u16 index) -{ - vp_iowrite16(index, &mdev->common->queue_select); - - return vp_ioread16(&mdev->common->queue_enable); -} - -/* - * vp_modern_set_queue_size - set size for a virtqueue - * @mdev: the modern virtio-pci device - * @index: the queue index - * @size: the size of the virtqueue - */ -static void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, - u16 index, u16 size) -{ - vp_iowrite16(index, &mdev->common->queue_select); - vp_iowrite16(size, &mdev->common->queue_size); - -} - -/* - * vp_modern_get_queue_size - get size for a virtqueue - * @mdev: the modern virtio-pci device - * @index: the queue index - * - * Returns the size of the virtqueue - */ -static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, - u16 index) -{ - vp_iowrite16(index, &mdev->common->queue_select); - - return vp_ioread16(&mdev->common->queue_size); - -} - -/* - * vp_modern_get_num_queues - get the number of virtqueues - * @mdev: the modern virtio-pci device - * - * Returns the number of virtqueues - */ -static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev) -{ - return vp_ioread16(&mdev->common->num_queues); -} - -/* - * vp_modern_get_queue_notify_off - get notification offset for a virtqueue - * @mdev: the modern virtio-pci device - * @index: the queue index - * - * Returns the notification offset for a virtqueue - */ -static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, - u16 index) -{ - vp_iowrite16(index, &mdev->common->queue_select); - - return vp_ioread16(&mdev->common->queue_notify_off); -} - /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -429,19 +134,6 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, } } -/* - * vp_modern_generation - get the device genreation - * @mdev: the modern virtio-pci device - * - * Returns the genreation read from device - */ -static u32 vp_modern_generation(struct virtio_pci_modern_device *mdev) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - return vp_ioread8(&cfg->config_generation); -} - static u32 vp_generation(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -449,19 +141,6 @@ static u32 vp_generation(struct virtio_device *vdev) return vp_modern_generation(&vp_dev->mdev); } -/* - * vp_modern_get_status - get the device status - * @mdev: the modern virtio-pci device - * - * Returns the status read from device - */ -static u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - return vp_ioread8(&cfg->device_status); -} - /* config->{get,set}_status() implementations */ static u8 vp_get_status(struct virtio_device *vdev) { @@ -470,19 +149,6 @@ static u8 vp_get_status(struct virtio_device *vdev) return vp_modern_get_status(&vp_dev->mdev); } -/* - * vp_modern_set_status - set status to device - * @mdev: the modern virtio-pci device - * @status: the status set to device - */ -static void vp_modern_set_status(struct virtio_pci_modern_device *mdev, - u8 status) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - vp_iowrite8(status, &cfg->device_status); -} - static void vp_set_status(struct virtio_device *vdev, u8 status) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -510,25 +176,6 @@ static void vp_reset(struct virtio_device *vdev) vp_synchronize_vectors(vdev); } -/* - * vp_modern_config_vector - set the vector for config interrupt - * @mdev: the modern virtio-pci device - * @vector: the config vector - * - * Returns the config vector read from the device - */ -static u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev, - u16 vector) -{ - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; - - /* Setup the vector used for configuration events */ - vp_iowrite16(vector, &cfg->msix_config); - /* Verify we had enough resources to assign the vector */ - /* Will also flush the write out to device */ - return vp_ioread16(&cfg->msix_config); -} - static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) { return vp_modern_config_vector(&vp_dev->mdev, vector); @@ -789,253 +436,6 @@ static const struct virtio_config_ops virtio_pci_config_ops = { .get_shm_region = vp_get_shm_region, }; -/** - * virtio_pci_find_capability - walk capabilities to find device info. - * @dev: the pci device - * @cfg_type: the VIRTIO_PCI_CAP_* value we seek - * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO. - * @bars: the bitmask of BARs - * - * Returns offset of the capability, or 0. - */ -static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type, - u32 ioresource_types, int *bars) -{ - int pos; - - for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); - pos > 0; - pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { - u8 type, bar; - pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, - cfg_type), - &type); - pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, - bar), - &bar); - - /* Ignore structures with reserved BAR values */ - if (bar > 0x5) - continue; - - if (type == cfg_type) { - if (pci_resource_len(dev, bar) && - pci_resource_flags(dev, bar) & ioresource_types) { - *bars |= (1 << bar); - return pos; - } - } - } - return 0; -} - -/* This is part of the ABI. Don't screw with it. */ -static inline void check_offsets(void) -{ - /* Note: disk space was harmed in compilation of this function. */ - BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR != - offsetof(struct virtio_pci_cap, cap_vndr)); - BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT != - offsetof(struct virtio_pci_cap, cap_next)); - BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN != - offsetof(struct virtio_pci_cap, cap_len)); - BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE != - offsetof(struct virtio_pci_cap, cfg_type)); - BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR != - offsetof(struct virtio_pci_cap, bar)); - BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET != - offsetof(struct virtio_pci_cap, offset)); - BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH != - offsetof(struct virtio_pci_cap, length)); - BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT != - offsetof(struct virtio_pci_notify_cap, - notify_off_multiplier)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT != - offsetof(struct virtio_pci_common_cfg, - device_feature_select)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF != - offsetof(struct virtio_pci_common_cfg, device_feature)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT != - offsetof(struct virtio_pci_common_cfg, - guest_feature_select)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF != - offsetof(struct virtio_pci_common_cfg, guest_feature)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX != - offsetof(struct virtio_pci_common_cfg, msix_config)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ != - offsetof(struct virtio_pci_common_cfg, num_queues)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS != - offsetof(struct virtio_pci_common_cfg, device_status)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION != - offsetof(struct virtio_pci_common_cfg, config_generation)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != - offsetof(struct virtio_pci_common_cfg, queue_select)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE != - offsetof(struct virtio_pci_common_cfg, queue_size)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX != - offsetof(struct virtio_pci_common_cfg, queue_msix_vector)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE != - offsetof(struct virtio_pci_common_cfg, queue_enable)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF != - offsetof(struct virtio_pci_common_cfg, queue_notify_off)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO != - offsetof(struct virtio_pci_common_cfg, queue_desc_lo)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI != - offsetof(struct virtio_pci_common_cfg, queue_desc_hi)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO != - offsetof(struct virtio_pci_common_cfg, queue_avail_lo)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI != - offsetof(struct virtio_pci_common_cfg, queue_avail_hi)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO != - offsetof(struct virtio_pci_common_cfg, queue_used_lo)); - BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI != - offsetof(struct virtio_pci_common_cfg, queue_used_hi)); -} - -/* - * vp_modern_probe: probe the modern virtio pci device, note that the - * caller is required to enable PCI device before calling this function. - * @mdev: the modern virtio-pci device - * - * Return 0 on succeed otherwise fail - */ -static int vp_modern_probe(struct virtio_pci_modern_device *mdev) -{ - struct pci_dev *pci_dev = mdev->pci_dev; - int err, common, isr, notify, device; - u32 notify_length; - u32 notify_offset; - - check_offsets(); - - mdev->pci_dev = pci_dev; - - /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ - if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) - return -ENODEV; - - if (pci_dev->device < 0x1040) { - /* Transitional devices: use the PCI subsystem device id as - * virtio device id, same as legacy driver always did. - */ - mdev->id.device = pci_dev->subsystem_device; - } else { - /* Modern devices: simply use PCI device id, but start from 0x1040. */ - mdev->id.device = pci_dev->device - 0x1040; - } - mdev->id.vendor = pci_dev->subsystem_vendor; - - /* check for a common config: if not, use legacy mode (bar 0). */ - common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG, - IORESOURCE_IO | IORESOURCE_MEM, - &mdev->modern_bars); - if (!common) { - dev_info(&pci_dev->dev, - "virtio_pci: leaving for legacy driver\n"); - return -ENODEV; - } - - /* If common is there, these should be too... */ - isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG, - IORESOURCE_IO | IORESOURCE_MEM, - &mdev->modern_bars); - notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG, - IORESOURCE_IO | IORESOURCE_MEM, - &mdev->modern_bars); - if (!isr || !notify) { - dev_err(&pci_dev->dev, - "virtio_pci: missing capabilities %i/%i/%i\n", - common, isr, notify); - return -EINVAL; - } - - err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)); - if (err) - err = dma_set_mask_and_coherent(&pci_dev->dev, - DMA_BIT_MASK(32)); - if (err) - dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); - - /* Device capability is only mandatory for devices that have - * device-specific configuration. - */ - device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG, - IORESOURCE_IO | IORESOURCE_MEM, - &mdev->modern_bars); - - err = pci_request_selected_regions(pci_dev, mdev->modern_bars, - "virtio-pci-modern"); - if (err) - return err; - - err = -EINVAL; - mdev->common = vp_modern_map_capability(mdev, common, - sizeof(struct virtio_pci_common_cfg), 4, - 0, sizeof(struct virtio_pci_common_cfg), - NULL); - if (!mdev->common) - goto err_map_common; - mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, - 0, 1, - NULL); - if (!mdev->isr) - goto err_map_isr; - - /* Read notify_off_multiplier from config space. */ - pci_read_config_dword(pci_dev, - notify + offsetof(struct virtio_pci_notify_cap, - notify_off_multiplier), - &mdev->notify_offset_multiplier); - /* Read notify length and offset from config space. */ - pci_read_config_dword(pci_dev, - notify + offsetof(struct virtio_pci_notify_cap, - cap.length), - ¬ify_length); - - pci_read_config_dword(pci_dev, - notify + offsetof(struct virtio_pci_notify_cap, - cap.offset), - ¬ify_offset); - - /* We don't know how many VQs we'll map, ahead of the time. - * If notify length is small, map it all now. - * Otherwise, map each VQ individually later. - */ - if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) { - mdev->notify_base = vp_modern_map_capability(mdev, notify, - 2, 2, - 0, notify_length, - &mdev->notify_len); - if (!mdev->notify_base) - goto err_map_notify; - } else { - mdev->notify_map_cap = notify; - } - - /* Again, we don't know how much we should map, but PAGE_SIZE - * is more than enough for all existing devices. - */ - if (device) { - mdev->device = vp_modern_map_capability(mdev, device, 0, 4, - 0, PAGE_SIZE, - &mdev->device_len); - if (!mdev->device) - goto err_map_device; - } - - return 0; - -err_map_device: - if (mdev->notify_base) - pci_iounmap(pci_dev, mdev->notify_base); -err_map_notify: - pci_iounmap(pci_dev, mdev->isr); -err_map_isr: - pci_iounmap(pci_dev, mdev->common); -err_map_common: - return err; -} - /* the PCI probing function */ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) { @@ -1063,23 +463,6 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) return 0; } -/* - * vp_modern_probe: remove and cleanup the modern virtio pci device - * @mdev: the modern virtio-pci device - */ -static void vp_modern_remove(struct virtio_pci_modern_device *mdev) -{ - struct pci_dev *pci_dev = mdev->pci_dev; - - if (mdev->device) - pci_iounmap(pci_dev, mdev->device); - if (mdev->notify_base) - pci_iounmap(pci_dev, mdev->notify_base); - pci_iounmap(pci_dev, mdev->isr); - pci_iounmap(pci_dev, mdev->common); - pci_release_selected_regions(pci_dev, mdev->modern_bars); -} - void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c new file mode 100644 index 000000000000..cbd667496bb1 --- /dev/null +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -0,0 +1,599 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +/* + * vp_modern_map_capability - map a part of virtio pci capability + * @mdev: the modern virtio-pci device + * @off: offset of the capability + * @minlen: minimal length of the capability + * @align: align requirement + * @start: start from the capability + * @size: map size + * @len: the length that is actually mapped + * + * Returns the io address of for the part of the capability + */ +void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, + size_t minlen, + u32 align, + u32 start, u32 size, + size_t *len) +{ + struct pci_dev *dev = mdev->pci_dev; + u8 bar; + u32 offset, length; + void __iomem *p; + + pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap, + bar), + &bar); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset), + &offset); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length), + &length); + + if (length <= start) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>%u expected)\n", + length, start); + return NULL; + } + + if (length - start < minlen) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>=%zu expected)\n", + length, minlen); + return NULL; + } + + length -= start; + + if (start + offset < offset) { + dev_err(&dev->dev, + "virtio_pci: map wrap-around %u+%u\n", + start, offset); + return NULL; + } + + offset += start; + + if (offset & (align - 1)) { + dev_err(&dev->dev, + "virtio_pci: offset %u not aligned to %u\n", + offset, align); + return NULL; + } + + if (length > size) + length = size; + + if (len) + *len = length; + + if (minlen + offset < minlen || + minlen + offset > pci_resource_len(dev, bar)) { + dev_err(&dev->dev, + "virtio_pci: map virtio %zu@%u " + "out of range on bar %i length %lu\n", + minlen, offset, + bar, (unsigned long)pci_resource_len(dev, bar)); + return NULL; + } + + p = pci_iomap_range(dev, bar, offset, length); + if (!p) + dev_err(&dev->dev, + "virtio_pci: unable to map virtio %u@%u on bar %i\n", + length, offset, bar); + return p; +} +EXPORT_SYMBOL_GPL(vp_modern_map_capability); + +/** + * virtio_pci_find_capability - walk capabilities to find device info. + * @dev: the pci device + * @cfg_type: the VIRTIO_PCI_CAP_* value we seek + * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO. + * @bars: the bitmask of BARs + * + * Returns offset of the capability, or 0. + */ +static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type, + u32 ioresource_types, int *bars) +{ + int pos; + + for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + pos > 0; + pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { + u8 type, bar; + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cfg_type), + &type); + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + bar), + &bar); + + /* Ignore structures with reserved BAR values */ + if (bar > 0x5) + continue; + + if (type == cfg_type) { + if (pci_resource_len(dev, bar) && + pci_resource_flags(dev, bar) & ioresource_types) { + *bars |= (1 << bar); + return pos; + } + } + } + return 0; +} + +/* This is part of the ABI. Don't screw with it. */ +static inline void check_offsets(void) +{ + /* Note: disk space was harmed in compilation of this function. */ + BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR != + offsetof(struct virtio_pci_cap, cap_vndr)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT != + offsetof(struct virtio_pci_cap, cap_next)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN != + offsetof(struct virtio_pci_cap, cap_len)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE != + offsetof(struct virtio_pci_cap, cfg_type)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR != + offsetof(struct virtio_pci_cap, bar)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET != + offsetof(struct virtio_pci_cap, offset)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH != + offsetof(struct virtio_pci_cap, length)); + BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT != + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT != + offsetof(struct virtio_pci_common_cfg, + device_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF != + offsetof(struct virtio_pci_common_cfg, device_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT != + offsetof(struct virtio_pci_common_cfg, + guest_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF != + offsetof(struct virtio_pci_common_cfg, guest_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX != + offsetof(struct virtio_pci_common_cfg, msix_config)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ != + offsetof(struct virtio_pci_common_cfg, num_queues)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS != + offsetof(struct virtio_pci_common_cfg, device_status)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION != + offsetof(struct virtio_pci_common_cfg, config_generation)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != + offsetof(struct virtio_pci_common_cfg, queue_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE != + offsetof(struct virtio_pci_common_cfg, queue_size)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX != + offsetof(struct virtio_pci_common_cfg, queue_msix_vector)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE != + offsetof(struct virtio_pci_common_cfg, queue_enable)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF != + offsetof(struct virtio_pci_common_cfg, queue_notify_off)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO != + offsetof(struct virtio_pci_common_cfg, queue_desc_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI != + offsetof(struct virtio_pci_common_cfg, queue_desc_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO != + offsetof(struct virtio_pci_common_cfg, queue_avail_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI != + offsetof(struct virtio_pci_common_cfg, queue_avail_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO != + offsetof(struct virtio_pci_common_cfg, queue_used_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI != + offsetof(struct virtio_pci_common_cfg, queue_used_hi)); +} + +/* + * vp_modern_probe: probe the modern virtio pci device, note that the + * caller is required to enable PCI device before calling this function. + * @mdev: the modern virtio-pci device + * + * Return 0 on succeed otherwise fail + */ +int vp_modern_probe(struct virtio_pci_modern_device *mdev) +{ + struct pci_dev *pci_dev = mdev->pci_dev; + int err, common, isr, notify, device; + u32 notify_length; + u32 notify_offset; + + check_offsets(); + + mdev->pci_dev = pci_dev; + + /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) + return -ENODEV; + + if (pci_dev->device < 0x1040) { + /* Transitional devices: use the PCI subsystem device id as + * virtio device id, same as legacy driver always did. + */ + mdev->id.device = pci_dev->subsystem_device; + } else { + /* Modern devices: simply use PCI device id, but start from 0x1040. */ + mdev->id.device = pci_dev->device - 0x1040; + } + mdev->id.vendor = pci_dev->subsystem_vendor; + + /* check for a common config: if not, use legacy mode (bar 0). */ + common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + if (!common) { + dev_info(&pci_dev->dev, + "virtio_pci: leaving for legacy driver\n"); + return -ENODEV; + } + + /* If common is there, these should be too... */ + isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + if (!isr || !notify) { + dev_err(&pci_dev->dev, + "virtio_pci: missing capabilities %i/%i/%i\n", + common, isr, notify); + return -EINVAL; + } + + err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)); + if (err) + err = dma_set_mask_and_coherent(&pci_dev->dev, + DMA_BIT_MASK(32)); + if (err) + dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + + /* Device capability is only mandatory for devices that have + * device-specific configuration. + */ + device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG, + IORESOURCE_IO | IORESOURCE_MEM, + &mdev->modern_bars); + + err = pci_request_selected_regions(pci_dev, mdev->modern_bars, + "virtio-pci-modern"); + if (err) + return err; + + err = -EINVAL; + mdev->common = vp_modern_map_capability(mdev, common, + sizeof(struct virtio_pci_common_cfg), 4, + 0, sizeof(struct virtio_pci_common_cfg), + NULL); + if (!mdev->common) + goto err_map_common; + mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, + 0, 1, + NULL); + if (!mdev->isr) + goto err_map_isr; + + /* Read notify_off_multiplier from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier), + &mdev->notify_offset_multiplier); + /* Read notify length and offset from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + cap.length), + ¬ify_length); + + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + cap.offset), + ¬ify_offset); + + /* We don't know how many VQs we'll map, ahead of the time. + * If notify length is small, map it all now. + * Otherwise, map each VQ individually later. + */ + if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) { + mdev->notify_base = vp_modern_map_capability(mdev, notify, + 2, 2, + 0, notify_length, + &mdev->notify_len); + if (!mdev->notify_base) + goto err_map_notify; + } else { + mdev->notify_map_cap = notify; + } + + /* Again, we don't know how much we should map, but PAGE_SIZE + * is more than enough for all existing devices. + */ + if (device) { + mdev->device = vp_modern_map_capability(mdev, device, 0, 4, + 0, PAGE_SIZE, + &mdev->device_len); + if (!mdev->device) + goto err_map_device; + } + + return 0; + +err_map_device: + if (mdev->notify_base) + pci_iounmap(pci_dev, mdev->notify_base); +err_map_notify: + pci_iounmap(pci_dev, mdev->isr); +err_map_isr: + pci_iounmap(pci_dev, mdev->common); +err_map_common: + return err; +} +EXPORT_SYMBOL_GPL(vp_modern_probe); + +/* + * vp_modern_probe: remove and cleanup the modern virtio pci device + * @mdev: the modern virtio-pci device + */ +void vp_modern_remove(struct virtio_pci_modern_device *mdev) +{ + struct pci_dev *pci_dev = mdev->pci_dev; + + if (mdev->device) + pci_iounmap(pci_dev, mdev->device); + if (mdev->notify_base) + pci_iounmap(pci_dev, mdev->notify_base); + pci_iounmap(pci_dev, mdev->isr); + pci_iounmap(pci_dev, mdev->common); + pci_release_selected_regions(pci_dev, mdev->modern_bars); +} +EXPORT_SYMBOL_GPL(vp_modern_remove); + +/* + * vp_modern_get_features - get features from device + * @mdev: the modern virtio-pci device + * + * Returns the features read from the device + */ +u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + u64 features; + + vp_iowrite32(0, &cfg->device_feature_select); + features = vp_ioread32(&cfg->device_feature); + vp_iowrite32(1, &cfg->device_feature_select); + features |= ((u64)vp_ioread32(&cfg->device_feature) << 32); + + return features; +} +EXPORT_SYMBOL_GPL(vp_modern_get_features); + +/* + * vp_modern_set_features - set features to device + * @mdev: the modern virtio-pci device + * @features: the features set to device + */ +void vp_modern_set_features(struct virtio_pci_modern_device *mdev, + u64 features) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite32(0, &cfg->guest_feature_select); + vp_iowrite32((u32)features, &cfg->guest_feature); + vp_iowrite32(1, &cfg->guest_feature_select); + vp_iowrite32(features >> 32, &cfg->guest_feature); +} +EXPORT_SYMBOL_GPL(vp_modern_set_features); + +/* + * vp_modern_generation - get the device genreation + * @mdev: the modern virtio-pci device + * + * Returns the genreation read from device + */ +u32 vp_modern_generation(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + return vp_ioread8(&cfg->config_generation); +} +EXPORT_SYMBOL_GPL(vp_modern_generation); + +/* + * vp_modern_get_status - get the device status + * @mdev: the modern virtio-pci device + * + * Returns the status read from device + */ +u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + return vp_ioread8(&cfg->device_status); +} +EXPORT_SYMBOL_GPL(vp_modern_get_status); + +/* + * vp_modern_set_status - set status to device + * @mdev: the modern virtio-pci device + * @status: the status set to device + */ +void vp_modern_set_status(struct virtio_pci_modern_device *mdev, + u8 status) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite8(status, &cfg->device_status); +} +EXPORT_SYMBOL_GPL(vp_modern_set_status); + +/* + * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue + * @mdev: the modern virtio-pci device + * @index: queue index + * @vector: the config vector + * + * Returns the config vector read from the device + */ +u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, + u16 index, u16 vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, &cfg->queue_select); + vp_iowrite16(vector, &cfg->queue_msix_vector); + /* Flush the write out to device */ + return vp_ioread16(&cfg->queue_msix_vector); +} +EXPORT_SYMBOL_GPL(vp_modern_queue_vector); + +/* + * vp_modern_config_vector - set the vector for config interrupt + * @mdev: the modern virtio-pci device + * @vector: the config vector + * + * Returns the config vector read from the device + */ +u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev, + u16 vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + /* Setup the vector used for configuration events */ + vp_iowrite16(vector, &cfg->msix_config); + /* Verify we had enough resources to assign the vector */ + /* Will also flush the write out to device */ + return vp_ioread16(&cfg->msix_config); +} +EXPORT_SYMBOL_GPL(vp_modern_config_vector); + +/* + * vp_modern_queue_address - set the virtqueue address + * @mdev: the modern virtio-pci device + * @index: the queue index + * @desc_addr: address of the descriptor area + * @driver_addr: address of the driver area + * @device_addr: address of the device area + */ +void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, + u16 index, u64 desc_addr, u64 driver_addr, + u64 device_addr) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, &cfg->queue_select); + + vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo, + &cfg->queue_desc_hi); + vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo, + &cfg->queue_avail_hi); + vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo, + &cfg->queue_used_hi); +} +EXPORT_SYMBOL_GPL(vp_modern_queue_address); + +/* + * vp_modern_set_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @enable: whether the virtqueue is enable or not + */ +void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index, bool enable) +{ + vp_iowrite16(index, &mdev->common->queue_select); + vp_iowrite16(enable, &mdev->common->queue_enable); +} +EXPORT_SYMBOL_GPL(vp_modern_set_queue_enable); + +/* + * vp_modern_get_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns whether a virtqueue is enabled or not + */ +bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_enable); +} +EXPORT_SYMBOL_GPL(vp_modern_get_queue_enable); + +/* + * vp_modern_set_queue_size - set size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @size: the size of the virtqueue + */ +void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, + u16 index, u16 size) +{ + vp_iowrite16(index, &mdev->common->queue_select); + vp_iowrite16(size, &mdev->common->queue_size); + +} +EXPORT_SYMBOL_GPL(vp_modern_set_queue_size); + +/* + * vp_modern_get_queue_size - get size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the size of the virtqueue + */ +u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_size); + +} +EXPORT_SYMBOL_GPL(vp_modern_get_queue_size); + +/* + * vp_modern_get_num_queues - get the number of virtqueues + * @mdev: the modern virtio-pci device + * + * Returns the number of virtqueues + */ +u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev) +{ + return vp_ioread16(&mdev->common->num_queues); +} +EXPORT_SYMBOL_GPL(vp_modern_get_num_queues); + +/* + * vp_modern_get_queue_notify_off - get notification offset for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the notification offset for a virtqueue + */ +u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, &mdev->common->queue_select); + + return vp_ioread16(&mdev->common->queue_notify_off); +} +EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off); + +MODULE_VERSION("0.1"); +MODULE_DESCRIPTION("Modern Virtio PCI Device"); +MODULE_AUTHOR("Jason Wang "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h new file mode 100644 index 000000000000..f26acbeec965 --- /dev/null +++ b/include/linux/virtio_pci_modern.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VIRTIO_PCI_MODERN_H +#define _LINUX_VIRTIO_PCI_MODERN_H + +#include +#include + +struct virtio_pci_modern_device { + struct pci_dev *pci_dev; + + struct virtio_pci_common_cfg __iomem *common; + /* Device-specific data (non-legacy mode) */ + void __iomem *device; + /* Base of vq notifications (non-legacy mode). */ + void __iomem *notify_base; + /* Where to read and clear interrupt */ + u8 __iomem *isr; + + /* So we can sanity-check accesses. */ + size_t notify_len; + size_t device_len; + + /* Capability for when we need to map notifications per-vq. */ + int notify_map_cap; + + /* Multiply queue_notify_off by this value. (non-legacy mode). */ + u32 notify_offset_multiplier; + + int modern_bars; + + struct virtio_device_id id; +}; + +/* + * Type-safe wrappers for io accesses. + * Use these to enforce at compile time the following spec requirement: + * + * The driver MUST access each field using the “natural” access + * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses + * for 16-bit fields and 8-bit accesses for 8-bit fields. + */ +static inline u8 vp_ioread8(const u8 __iomem *addr) +{ + return ioread8(addr); +} +static inline u16 vp_ioread16 (const __le16 __iomem *addr) +{ + return ioread16(addr); +} + +static inline u32 vp_ioread32(const __le32 __iomem *addr) +{ + return ioread32(addr); +} + +static inline void vp_iowrite8(u8 value, u8 __iomem *addr) +{ + iowrite8(value, addr); +} + +static inline void vp_iowrite16(u16 value, __le16 __iomem *addr) +{ + iowrite16(value, addr); +} + +static inline void vp_iowrite32(u32 value, __le32 __iomem *addr) +{ + iowrite32(value, addr); +} + +static inline void vp_iowrite64_twopart(u64 val, + __le32 __iomem *lo, + __le32 __iomem *hi) +{ + vp_iowrite32((u32)val, lo); + vp_iowrite32(val >> 32, hi); +} + +u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev); +void vp_modern_set_features(struct virtio_pci_modern_device *mdev, + u64 features); +u32 vp_modern_generation(struct virtio_pci_modern_device *mdev); +u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev); +void vp_modern_set_status(struct virtio_pci_modern_device *mdev, + u8 status); +u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, + u16 idx, u16 vector); +u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev, + u16 vector); +void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, + u16 index, u64 desc_addr, u64 driver_addr, + u64 device_addr); +void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev, + u16 idx, bool enable); +bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, + u16 idx); +void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, + u16 idx, u16 size); +u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, + u16 idx); +u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); +u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, + u16 idx); +void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, + size_t minlen, + u32 align, + u32 start, u32 size, + size_t *len); +int vp_modern_probe(struct virtio_pci_modern_device *mdev); +void vp_modern_remove(struct virtio_pci_modern_device *mdev); +#endif From 1628c6877f371194b603330c324828d03e0eacda Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Jan 2021 14:55:02 +0800 Subject: [PATCH 308/506] virtio_vdpa: don't warn when fail to disable vq There's no guarantee that the device can disable a specific virtqueue through set_vq_ready(). One example is the modern virtio-pci device. So this patch removes the warning. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210104065503.199631-19-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_vdpa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 4a9ddb44b2a7..e28acf482e0c 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -225,9 +225,8 @@ static void virtio_vdpa_del_vq(struct virtqueue *vq) list_del(&info->node); spin_unlock_irqrestore(&vd_dev->lock, flags); - /* Select and deactivate the queue */ + /* Select and deactivate the queue (best effort) */ ops->set_vq_ready(vdpa, index, 0); - WARN_ON(ops->get_vq_ready(vdpa, index)); vring_del_virtqueue(vq); From d1e9aa9c34a776d43de819d3d334833aff5cca5a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 22 Jan 2021 17:21:46 +0800 Subject: [PATCH 309/506] virtio-blk: support per-device queue depth module parameter 'virtblk_queue_depth' was firstly introduced for testing/benchmarking purposes described in commit fc4324b4597c ("virtio-blk: base queue-depth on virtqueue ringsize or module param"). And currently 'virtblk_queue_depth' is used as a saved value for the first probed device. Since we have different virtio-blk devices which have different capabilities, it requires that we support per-device queue depth instead of per-module. So defaultly use vq free elements if module parameter 'virtblk_queue_depth' is not set. Signed-off-by: Joseph Qi Acked-by: Jason Wang Link: https://lore.kernel.org/r/1611307306-71067-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi --- drivers/block/virtio_blk.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 145606dc52db..fd168a2edc1a 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -705,6 +705,7 @@ static int virtblk_probe(struct virtio_device *vdev) u32 v, blk_size, max_size, sg_elems, opt_io_size; u16 min_io_size; u8 physical_block_exp, alignment_offset; + unsigned int queue_depth; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", @@ -756,16 +757,18 @@ static int virtblk_probe(struct virtio_device *vdev) } /* Default queue sizing is to fill the ring. */ - if (!virtblk_queue_depth) { - virtblk_queue_depth = vblk->vqs[0].vq->num_free; + if (likely(!virtblk_queue_depth)) { + queue_depth = vblk->vqs[0].vq->num_free; /* ... but without indirect descs, we use 2 descs per req */ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) - virtblk_queue_depth /= 2; + queue_depth /= 2; + } else { + queue_depth = virtblk_queue_depth; } memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); vblk->tag_set.ops = &virtio_mq_ops; - vblk->tag_set.queue_depth = virtblk_queue_depth; + vblk->tag_set.queue_depth = queue_depth; vblk->tag_set.numa_node = NUMA_NO_NODE; vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; vblk->tag_set.cmd_size = From a6829c350ec6bec47752826c9738122308e1588c Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Tue, 2 Feb 2021 08:19:23 +0800 Subject: [PATCH 310/506] virtio_input: Prevent EV_MSC/MSC_TIMESTAMP loop storm for MT. In 'commit 29cc309d8bf1 ("HID: hid-multitouch: forward MSC_TIMESTAMP")', EV_MSC/MSC_TIMESTAMP is added to each before EV_SYN event. EV_MSC is configured as INPUT_PASS_TO_ALL. In case of a touch device which report MSC_TIMESTAMP: BE pass EV_MSC/MSC_TIMESTAMP to FE on receiving event from evdev. FE pass EV_MSC/MSC_TIMESTAMP back to BE. BE writes EV_MSC/MSC_TIMESTAMP to evdev due to INPUT_PASS_TO_ALL. BE receives extra EV_MSC/MSC_TIMESTAMP and pass to FE. >>> Each new frame becomes larger and larger. Disable EV_MSC/MSC_TIMESTAMP forwarding for MT. V2: Rebase. Signed-off-by: Colin Xu Link: https://lore.kernel.org/r/20210202001923.6227-1-colin.xu@intel.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Gerd Hoffmann --- drivers/virtio/virtio_input.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index f1f6208edcf5..244965c20d9b 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -64,6 +64,21 @@ static int virtinput_send_status(struct virtio_input *vi, unsigned long flags; int rc; + /* + * Since 29cc309d8bf1 (HID: hid-multitouch: forward MSC_TIMESTAMP), + * EV_MSC/MSC_TIMESTAMP is added to each before EV_SYN event. + * EV_MSC is configured as INPUT_PASS_TO_ALL. + * In case of touch device: + * BE pass EV_MSC/MSC_TIMESTAMP to FE on receiving event from evdev. + * FE pass EV_MSC/MSC_TIMESTAMP back to BE. + * BE writes EV_MSC/MSC_TIMESTAMP to evdev due to INPUT_PASS_TO_ALL. + * BE receives extra EV_MSC/MSC_TIMESTAMP and pass to FE. + * >>> Each new frame becomes larger and larger. + * Disable EV_MSC/MSC_TIMESTAMP forwarding for MT. + */ + if (vi->idev->mt && type == EV_MSC && code == MSC_TIMESTAMP) + return 0; + stsbuf = kzalloc(sizeof(*stsbuf), GFP_ATOMIC); if (!stsbuf) return -ENOMEM; From 95efabf077babf09ea148f941729e953ac185d8a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Nov 2020 12:40:44 -0600 Subject: [PATCH 311/506] virtio_net: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning by explicitly adding a goto statement instead of letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/cb9b9534572bc476f4fb7b49a73dc8646b780c84.1605896060.git.gustavoars@kernel.org Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 508408fbe78f..1d8373773a90 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -732,6 +732,7 @@ static struct sk_buff *receive_small(struct net_device *dev, fallthrough; case XDP_ABORTED: trace_xdp_exception(vi->dev, xdp_prog, act); + goto err_xdp; case XDP_DROP: goto err_xdp; } From dcfde1635e764fd69cc756c7780d144e288608e9 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 8 Feb 2021 17:17:41 +0100 Subject: [PATCH 312/506] vdpa/mlx5: fix param validation in mlx5_vdpa_get_config() It's legal to have 'offset + len' equal to sizeof(struct virtio_net_config), since 'ndev->config' is a 'struct virtio_net_config', so we can safely copy its content under this condition. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Cc: stable@vger.kernel.org Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210208161741.104939-1-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Acked-by: Eli Cohen --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index dc88559a8d49..10e9b09932eb 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1820,7 +1820,7 @@ static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - if (offset + len < sizeof(struct virtio_net_config)) + if (offset + len <= sizeof(struct virtio_net_config)) memcpy(buf, (u8 *)&ndev->config + offset, len); } From 0c4aeb4b5087f09ef61ff362394202ab7cf9f3e2 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 6 Feb 2021 07:46:59 -0500 Subject: [PATCH 313/506] virtio_mmio: fix one typo fix the typo 'there is are' to 'there are'. Signed-off-by: Xianting Tian Link: https://lore.kernel.org/r/1612615619-8128-1-git-send-email-xianting_tian@126.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 238383ff1064..a286d22b6551 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -126,7 +126,7 @@ static int vm_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* Make sure there is are no mixed devices */ + /* Make sure there are no mixed devices */ if (vm_dev->version == 2 && !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n"); From 16c10bede8b3d8594279752bf53153491f3f944f Mon Sep 17 00:00:00 2001 From: Mathias Crombez Date: Fri, 15 Jan 2021 02:26:23 +0200 Subject: [PATCH 314/506] virtio-input: add multi-touch support Without multi-touch slots allocated, ABS_MT_SLOT events will be lost by input_handle_abs_event. Implementation is based on uinput_create_device. Signed-off-by: Mathias Crombez Co-developed-by: Vasyl Vavrychuk Signed-off-by: Vasyl Vavrychuk Link: https://lore.kernel.org/r/20210115002623.8576-1-vasyl.vavrychuk@opensynergy.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Gerd Hoffmann --- drivers/virtio/virtio_input.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index 244965c20d9b..ce51ae165943 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -7,6 +7,7 @@ #include #include +#include struct virtio_input { struct virtio_device *vdev; @@ -219,7 +220,7 @@ static int virtinput_probe(struct virtio_device *vdev) struct virtio_input *vi; unsigned long flags; size_t size; - int abs, err; + int abs, err, nslots; if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) return -ENODEV; @@ -304,6 +305,13 @@ static int virtinput_probe(struct virtio_device *vdev) continue; virtinput_cfg_abs(vi, abs); } + + if (test_bit(ABS_MT_SLOT, vi->idev->absbit)) { + nslots = input_abs_get_max(vi->idev, ABS_MT_SLOT) + 1; + err = input_mt_init_slots(vi->idev, nslots, 0); + if (err) + goto err_mt_init_slots; + } } virtio_device_ready(vdev); @@ -319,6 +327,7 @@ static int virtinput_probe(struct virtio_device *vdev) spin_lock_irqsave(&vi->lock, flags); vi->ready = false; spin_unlock_irqrestore(&vi->lock, flags); +err_mt_init_slots: input_free_device(vi->idev); err_input_alloc: vdev->config->del_vqs(vdev); From 34b07d47dd003168556a1774558240fefbb9b461 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 16 Feb 2021 14:29:54 -0700 Subject: [PATCH 315/506] drm/i915: Enable -Wuninitialized -Wunintialized was disabled in commit c5627461490e ("drm/i915: Disable -Wuninitialized") because there were two warnings that were false positives. The first was due to DECLARE_WAIT_QUEUE_HEAD_ONSTACK, which was fixed in LLVM 9.0.0. The second was in busywait_stop, which was fixed in LLVM 10.0.0 (issue 415). The kernel's minimum version for LLVM is 10.0.1 so this warning can be safely enabled, where it has already caught a couple bugs. Link: https://github.com/ClangBuiltLinux/linux/issues/220 Link: https://github.com/ClangBuiltLinux/linux/issues/415 Link: https://github.com/ClangBuiltLinux/linux/issues/499 Link: https://github.com/llvm/llvm-project/commit/2e040398f8d691cc378c1abb098824ff49f3f28f Link: https://github.com/llvm/llvm-project/commit/c667cdc850c2aa821ffeedbc08c24bc985c59edd Fixes: c5627461490e ("drm/i915: Disable -Wuninitialized") References: 2ea4a7ba9bf6 ("drm/i915/gt: Avoid uninitialized use of rpcurupei in frequency_show") References: 2034c2129bc4 ("drm/i915/display: Ensure that ret is always initialized in icl_combo_phy_verify_state") Reported-by: Arnd Bergmann Signed-off-by: Nathan Chancellor Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20210216212953.24458-1-nathan@kernel.org (cherry picked from commit b2423184ac3352a52fc7562fa0e7d23435fe67b9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 32bd1fdffffe..dae6e93fc1e5 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -21,7 +21,6 @@ subdir-ccflags-y += $(call cc-disable-warning, unused-but-set-variable) subdir-ccflags-y += $(call cc-disable-warning, sign-compare) subdir-ccflags-y += $(call cc-disable-warning, sometimes-uninitialized) subdir-ccflags-y += $(call cc-disable-warning, initializer-overrides) -subdir-ccflags-y += $(call cc-disable-warning, uninitialized) subdir-ccflags-y += $(call cc-disable-warning, frame-address) subdir-ccflags-$(CONFIG_DRM_I915_WERROR) += -Werror From ed428ffc28521b9f1b6a71584d1875318a122859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Feb 2021 22:23:22 +0200 Subject: [PATCH 316/506] drm/i915: Nuke INTEL_OUTPUT_FORMAT_INVALID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We tend to use output_format!=RGB as a shorthand for YCbCr, but this fails if we have a disabled crtc where output_format==INVALID. We're now getting some fail from intel_color_check() when we have: hw.enable==false hw.ctm!=NULL output_format==INVALID Let's avoid that by throwing INTEL_OUTPUT_FORMAT_INVALID to the dumpster, and thus everything defaults to RGB when the crtc is disabled. This does beg the deeper question of how much of the state should we in fact be validating when hw/uapi.enable==false. And should we even be doing the uapi->hw copy when uapi.enable==false? So far I've not been able to come up with satisfactory answers for myself, so I'm putting it off for the moment. Cc: Lee Shawn C Fixes: 0aa5c3835c8a ("drm/i915: support two CSC module on gen11 and later") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2964 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20210205202322.27608-1-ville.syrjala@linux.intel.com Reviewed-by: José Roberto de Souza (cherry picked from commit 7e07c68f06a248441b485249de4c4115cba262cc) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_crtc.c | 1 - drivers/gpu/drm/i915/display/intel_display.c | 3 +-- drivers/gpu/drm/i915/display/intel_display_types.h | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c index 57b0a3ebe908..8e77ca7ddf11 100644 --- a/drivers/gpu/drm/i915/display/intel_crtc.c +++ b/drivers/gpu/drm/i915/display/intel_crtc.c @@ -109,7 +109,6 @@ void intel_crtc_state_reset(struct intel_crtc_state *crtc_state, crtc_state->cpu_transcoder = INVALID_TRANSCODER; crtc_state->master_transcoder = INVALID_TRANSCODER; crtc_state->hsw_workaround_pipe = INVALID_PIPE; - crtc_state->output_format = INTEL_OUTPUT_FORMAT_INVALID; crtc_state->scaler_state.scaler_id = -1; crtc_state->mst_master_transcoder = INVALID_TRANSCODER; } diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 7ea1e5b487b6..8d7aaa68c6f6 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -10211,7 +10211,6 @@ static void snprintf_output_types(char *buf, size_t len, } static const char * const output_format_str[] = { - [INTEL_OUTPUT_FORMAT_INVALID] = "Invalid", [INTEL_OUTPUT_FORMAT_RGB] = "RGB", [INTEL_OUTPUT_FORMAT_YCBCR420] = "YCBCR4:2:0", [INTEL_OUTPUT_FORMAT_YCBCR444] = "YCBCR4:4:4", @@ -10220,7 +10219,7 @@ static const char * const output_format_str[] = { static const char *output_formats(enum intel_output_format format) { if (format >= ARRAY_SIZE(output_format_str)) - format = INTEL_OUTPUT_FORMAT_INVALID; + return "invalid"; return output_format_str[format]; } diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index 39397748b4b0..184ecbbcec99 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -830,7 +830,6 @@ struct intel_crtc_wm_state { }; enum intel_output_format { - INTEL_OUTPUT_FORMAT_INVALID, INTEL_OUTPUT_FORMAT_RGB, INTEL_OUTPUT_FORMAT_YCBCR420, INTEL_OUTPUT_FORMAT_YCBCR444, From 06f45fe96fcd81531b0bcb2a6115da563ae6dbd6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 19 Feb 2021 16:40:28 +0100 Subject: [PATCH 317/506] xen/events: add per-xenbus device event statistics and settings Add syfs nodes for each xenbus device showing event statistics (number of events and spurious events, number of associated event channels) and for setting a spurious event threshold in case a frontend is sending too many events without being rogue on purpose. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210219154030.10892-7-jgross@suse.com Signed-off-by: Boris Ostrovsky --- .../ABI/testing/sysfs-devices-xenbus | 41 ++++++++++++ drivers/xen/events/events_base.c | 27 +++++++- drivers/xen/xenbus/xenbus_probe.c | 66 +++++++++++++++++++ include/xen/xenbus.h | 7 ++ 4 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-devices-xenbus diff --git a/Documentation/ABI/testing/sysfs-devices-xenbus b/Documentation/ABI/testing/sysfs-devices-xenbus new file mode 100644 index 000000000000..fd796cb4f315 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-xenbus @@ -0,0 +1,41 @@ +What: /sys/devices/*/xenbus/event_channels +Date: February 2021 +Contact: Xen Developers mailing list +Description: + Number of Xen event channels associated with a kernel based + paravirtualized device frontend or backend. + +What: /sys/devices/*/xenbus/events +Date: February 2021 +Contact: Xen Developers mailing list +Description: + Total number of Xen events received for a Xen pv device + frontend or backend. + +What: /sys/devices/*/xenbus/jiffies_eoi_delayed +Date: February 2021 +Contact: Xen Developers mailing list +Description: + Summed up time in jiffies the EOI of an interrupt for a Xen + pv device has been delayed in order to avoid stalls due to + event storms. This value rising is a first sign for a rogue + other end of the pv device. + +What: /sys/devices/*/xenbus/spurious_events +Date: February 2021 +Contact: Xen Developers mailing list +Description: + Number of events received for a Xen pv device which did not + require any action. Too many spurious events in a row will + trigger delayed EOI processing. + +What: /sys/devices/*/xenbus/spurious_threshold +Date: February 2021 +Contact: Xen Developers mailing list +Description: + Controls the tolerated number of subsequent spurious events + before delayed EOI processing is triggered for a Xen pv + device. Default is 1. This can be modified in case the other + end of the pv device is issuing spurious events on a regular + basis and is known not to be malicious on purpose. Raising + the value for such cases can improve pv device performance. diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b249f2d6b0cc..adb7260e94b2 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -323,6 +323,8 @@ static int xen_irq_info_evtchn_setup(unsigned irq, ret = xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); info->u.interdomain = dev; + if (dev) + atomic_inc(&dev->event_channels); return ret; } @@ -568,18 +570,28 @@ static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) return; if (spurious) { + struct xenbus_device *dev = info->u.interdomain; + unsigned int threshold = 1; + + if (dev && dev->spurious_threshold) + threshold = dev->spurious_threshold; + if ((1 << info->spurious_cnt) < (HZ << 2)) { if (info->spurious_cnt != 0xFF) info->spurious_cnt++; } - if (info->spurious_cnt > 1) { - delay = 1 << (info->spurious_cnt - 2); + if (info->spurious_cnt > threshold) { + delay = 1 << (info->spurious_cnt - 1 - threshold); if (delay > HZ) delay = HZ; if (!info->eoi_time) info->eoi_cpu = smp_processor_id(); info->eoi_time = get_jiffies_64() + delay; + if (dev) + atomic_add(delay, &dev->jiffies_eoi_delayed); } + if (dev) + atomic_inc(&dev->spurious_events); } else { info->spurious_cnt = 0; } @@ -908,6 +920,7 @@ static void __unbind_from_irq(unsigned int irq) if (VALID_EVTCHN(evtchn)) { unsigned int cpu = cpu_from_irq(irq); + struct xenbus_device *dev; xen_evtchn_close(evtchn); @@ -918,6 +931,11 @@ static void __unbind_from_irq(unsigned int irq) case IRQT_IPI: per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; break; + case IRQT_EVTCHN: + dev = info->u.interdomain; + if (dev) + atomic_dec(&dev->event_channels); + break; default: break; } @@ -1581,6 +1599,7 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) { int irq; struct irq_info *info; + struct xenbus_device *dev; irq = get_evtchn_to_irq(port); if (irq == -1) @@ -1610,6 +1629,10 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) info = info_for_irq(irq); + dev = (info->type == IRQT_EVTCHN) ? info->u.interdomain : NULL; + if (dev) + atomic_inc(&dev->events); + if (ctrl->defer_eoi) { info->eoi_cpu = smp_processor_id(); info->irq_epoch = __this_cpu_read(irq_epoch); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 8a75092bb148..97f0d234482d 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -206,6 +206,65 @@ void xenbus_otherend_changed(struct xenbus_watch *watch, } EXPORT_SYMBOL_GPL(xenbus_otherend_changed); +#define XENBUS_SHOW_STAT(name) \ +static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + \ + return sprintf(buf, "%d\n", atomic_read(&dev->name)); \ +} \ +static DEVICE_ATTR(name, 0444, show_##name, NULL) + +XENBUS_SHOW_STAT(event_channels); +XENBUS_SHOW_STAT(events); +XENBUS_SHOW_STAT(spurious_events); +XENBUS_SHOW_STAT(jiffies_eoi_delayed); + +static ssize_t show_spurious_threshold(struct device *_dev, + struct device_attribute *attr, + char *buf) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + + return sprintf(buf, "%d\n", dev->spurious_threshold); +} + +static ssize_t set_spurious_threshold(struct device *_dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + dev->spurious_threshold = val; + + return count; +} + +static DEVICE_ATTR(spurious_threshold, 0644, show_spurious_threshold, + set_spurious_threshold); + +static struct attribute *xenbus_attrs[] = { + &dev_attr_event_channels.attr, + &dev_attr_events.attr, + &dev_attr_spurious_events.attr, + &dev_attr_jiffies_eoi_delayed.attr, + &dev_attr_spurious_threshold.attr, + NULL +}; + +static const struct attribute_group xenbus_group = { + .name = "xenbus", + .attrs = xenbus_attrs, +}; + int xenbus_dev_probe(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); @@ -253,6 +312,11 @@ int xenbus_dev_probe(struct device *_dev) return err; } + dev->spurious_threshold = 1; + if (sysfs_create_group(&dev->dev.kobj, &xenbus_group)) + dev_warn(&dev->dev, "sysfs_create_group on %s failed.\n", + dev->nodename); + return 0; fail_put: module_put(drv->driver.owner); @@ -269,6 +333,8 @@ int xenbus_dev_remove(struct device *_dev) DPRINTK("%s", dev->nodename); + sysfs_remove_group(&dev->dev.kobj, &xenbus_group); + free_otherend_watch(dev); if (drv->remove) { diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index bf3cfc7c35d0..0b1386073d49 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -88,6 +88,13 @@ struct xenbus_device { struct completion down; struct work_struct work; struct semaphore reclaim_sem; + + /* Event channel based statistics and settings. */ + atomic_t event_channels; + atomic_t events; + atomic_t spurious_events; + atomic_t jiffies_eoi_delayed; + unsigned int spurious_threshold; }; static inline struct xenbus_device *to_xenbus_device(struct device *dev) From d56699594046d54f32936a1eec337a62c15f931a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 19 Feb 2021 16:40:29 +0100 Subject: [PATCH 318/506] xen/evtchn: use smp barriers for user event ring The ring buffer for user events is local to the given kernel instance, so smp barriers are fine for ensuring consistency. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Link: https://lore.kernel.org/r/20210219154030.10892-8-jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/evtchn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index a7a85719a8c8..421382c73d88 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -173,7 +173,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) if ((u->ring_prod - u->ring_cons) < u->ring_size) { *evtchn_ring_entry(u, u->ring_prod) = evtchn->port; - wmb(); /* Ensure ring contents visible */ + smp_wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wake_up_interruptible(&u->evtchn_wait); kill_fasync(&u->evtchn_async_queue, @@ -245,7 +245,7 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, } rc = -EFAULT; - rmb(); /* Ensure that we see the port before we copy it. */ + smp_rmb(); /* Ensure that we see the port before we copy it. */ if (copy_to_user(buf, evtchn_ring_entry(u, c), bytes1) || ((bytes2 != 0) && copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) From 6977c0b560f190d0d4786f99d9c120126fe654f2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 19 Feb 2021 16:40:30 +0100 Subject: [PATCH 319/506] xen/evtchn: use READ/WRITE_ONCE() for accessing ring indices For avoiding read- and write-tearing by the compiler use READ_ONCE() and WRITE_ONCE() for accessing the ring indices in evtchn.c. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210219154030.10892-9-jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/evtchn.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 421382c73d88..c99415a70051 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -162,6 +162,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) { struct user_evtchn *evtchn = data; struct per_user_data *u = evtchn->user; + unsigned int prod, cons; WARN(!evtchn->enabled, "Interrupt for port %u, but apparently not enabled; per-user %p\n", @@ -171,10 +172,14 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) spin_lock(&u->ring_prod_lock); - if ((u->ring_prod - u->ring_cons) < u->ring_size) { - *evtchn_ring_entry(u, u->ring_prod) = evtchn->port; + prod = READ_ONCE(u->ring_prod); + cons = READ_ONCE(u->ring_cons); + + if ((prod - cons) < u->ring_size) { + *evtchn_ring_entry(u, prod) = evtchn->port; smp_wmb(); /* Ensure ring contents visible */ - if (u->ring_cons == u->ring_prod++) { + WRITE_ONCE(u->ring_prod, prod + 1); + if (cons == prod) { wake_up_interruptible(&u->evtchn_wait); kill_fasync(&u->evtchn_async_queue, SIGIO, POLL_IN); @@ -210,8 +215,8 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, if (u->ring_overflow) goto unlock_out; - c = u->ring_cons; - p = u->ring_prod; + c = READ_ONCE(u->ring_cons); + p = READ_ONCE(u->ring_prod); if (c != p) break; @@ -221,7 +226,7 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, return -EAGAIN; rc = wait_event_interruptible(u->evtchn_wait, - u->ring_cons != u->ring_prod); + READ_ONCE(u->ring_cons) != READ_ONCE(u->ring_prod)); if (rc) return rc; } @@ -251,7 +256,7 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) goto unlock_out; - u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); + WRITE_ONCE(u->ring_cons, c + (bytes1 + bytes2) / sizeof(evtchn_port_t)); rc = bytes1 + bytes2; unlock_out: @@ -552,7 +557,9 @@ static long evtchn_ioctl(struct file *file, /* Initialise the ring to empty. Clear errors. */ mutex_lock(&u->ring_cons_mutex); spin_lock_irq(&u->ring_prod_lock); - u->ring_cons = u->ring_prod = u->ring_overflow = 0; + WRITE_ONCE(u->ring_cons, 0); + WRITE_ONCE(u->ring_prod, 0); + u->ring_overflow = 0; spin_unlock_irq(&u->ring_prod_lock); mutex_unlock(&u->ring_cons_mutex); rc = 0; @@ -595,7 +602,7 @@ static __poll_t evtchn_poll(struct file *file, poll_table *wait) struct per_user_data *u = file->private_data; poll_wait(file, &u->evtchn_wait, wait); - if (u->ring_cons != u->ring_prod) + if (READ_ONCE(u->ring_cons) != READ_ONCE(u->ring_prod)) mask |= EPOLLIN | EPOLLRDNORM; if (u->ring_overflow) mask = EPOLLERR; From 43135df0d7f0a66c75143a1e95ed70a2005ca329 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Feb 2021 15:46:18 -0800 Subject: [PATCH 320/506] xen: Replace lkml.org links with lore As started by commit 05a5f51ca566 ("Documentation: Replace lkml.org links with lore"), replace lkml.org links with lore to better use a single source that's more likely to stay available long-term. Signed-off-by: Kees Cook Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210210234618.2734785-1-keescook@chromium.org Signed-off-by: Boris Ostrovsky --- drivers/xen/xen-acpi-processor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index ce8ffb595a46..df7cab870be5 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -3,7 +3,8 @@ * Copyright 2012 by Oracle Inc * Author: Konrad Rzeszutek Wilk * - * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249 + * This code borrows ideas from + * https://lore.kernel.org/lkml/1322673664-14642-6-git-send-email-konrad.wilk@oracle.com * so many thanks go to Kevin Tian * and Yu Ke . */ From 67b45af946ec3148b64e6a3a1ee2ea8f79c5bc07 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 23 Feb 2021 09:39:57 +0800 Subject: [PATCH 321/506] KVM: vmx/pmu: Fix dummy check if lbr_desc->event is created If lbr_desc->event is successfully created, the intel_pmu_create_ guest_lbr_event() will return 0, otherwise it will return -ENOENT, and then jump to LBR msrs dummy handling. Fixes: 1b5ac3226a1a ("KVM: vmx/pmu: Pass-through LBR msrs when the guest LBR event is ACTIVE") Signed-off-by: Like Xu Message-Id: <20210223013958.1280444-1-like.xu@linux.intel.com> [Add "< 0" and PTR_ERR to make the code clearer. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index d1df618cb7de..9efc1a6b8693 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -298,7 +298,7 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) if (IS_ERR(event)) { pr_debug_ratelimited("%s: failed %ld\n", __func__, PTR_ERR(event)); - return -ENOENT; + return PTR_ERR(event); } lbr_desc->event = event; pmu->event_count++; @@ -320,7 +320,7 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, if (!intel_pmu_is_valid_lbr_msr(vcpu, index)) return false; - if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu)) + if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0) goto dummy; /* From 53f131c284e83c29c227c0938926a82b2ed4d7ba Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 23 Feb 2021 17:26:21 +0100 Subject: [PATCH 322/506] xen-front-pgdir-shbuf: don't record wrong grant handle upon error In order for subsequent unmapping to not mistakenly unmap handle 0, record a perceived always-invalid one instead. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Reviewed-by: Oleksandr Andrushchenko Link: https://lore.kernel.org/r/82414b0f-1b63-5509-7c1d-5bcc8239a3de@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/xen-front-pgdir-shbuf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c index 48a658dc7ccf..81b6e13fa5ec 100644 --- a/drivers/xen/xen-front-pgdir-shbuf.c +++ b/drivers/xen/xen-front-pgdir-shbuf.c @@ -305,11 +305,18 @@ static int backend_map(struct xen_front_pgdir_shbuf *buf) /* Save handles even if error, so we can unmap. */ for (cur_page = 0; cur_page < buf->num_pages; cur_page++) { - buf->backend_map_handles[cur_page] = map_ops[cur_page].handle; - if (unlikely(map_ops[cur_page].status != GNTST_okay)) + if (likely(map_ops[cur_page].status == GNTST_okay)) { + buf->backend_map_handles[cur_page] = + map_ops[cur_page].handle; + } else { + buf->backend_map_handles[cur_page] = + INVALID_GRANT_HANDLE; + if (!ret) + ret = -ENXIO; dev_err(&buf->xb_dev->dev, "Failed to map page %d: %d\n", cur_page, map_ops[cur_page].status); + } } if (ret) { From ee576c47db60432c37e54b1e2b43a8ca6d3a8dca Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 23 Feb 2021 14:18:58 +0100 Subject: [PATCH 323/506] net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending The icmp{,v6}_send functions make all sorts of use of skb->cb, casting it with IPCB or IP6CB, assuming the skb to have come directly from the inet layer. But when the packet comes from the ndo layer, especially when forwarded, there's no telling what might be in skb->cb at that point. As a result, the icmp sending code risks reading bogus memory contents, which can result in nasty stack overflows such as this one reported by a user: panic+0x108/0x2ea __stack_chk_fail+0x14/0x20 __icmp_send+0x5bd/0x5c0 icmp_ndo_send+0x148/0x160 In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read from it. The optlen parameter there is of particular note, as it can induce writes beyond bounds. There are quite a few ways that can happen in __ip_options_echo. For example: // sptr/skb are attacker-controlled skb bytes sptr = skb_network_header(skb); // dptr/dopt points to stack memory allocated by __icmp_send dptr = dopt->__data; // sopt is the corrupt skb->cb in question if (sopt->rr) { optlen = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data // this now writes potentially attacker-controlled data, over // flowing the stack: memcpy(dptr, sptr+sopt->rr, optlen); } In the icmpv6_send case, the story is similar, but not as dire, as only IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is worse than the iif case, but it is passed to ipv6_find_tlv, which does a bit of bounds checking on the value. This is easy to simulate by doing a `memset(skb->cb, 0x41, sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by good fortune and the rarity of icmp sending from that context that we've avoided reports like this until now. For example, in KASAN: BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0 Write of size 38 at addr ffff888006f1f80e by task ping/89 CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5 Call Trace: dump_stack+0x9a/0xcc print_address_description.constprop.0+0x1a/0x160 __kasan_report.cold+0x20/0x38 kasan_report+0x32/0x40 check_memory_region+0x145/0x1a0 memcpy+0x39/0x60 __ip_options_echo+0xa0e/0x12b0 __icmp_send+0x744/0x1700 Actually, out of the 4 drivers that do this, only gtp zeroed the cb for the v4 case, while the rest did not. So this commit actually removes the gtp-specific zeroing, while putting the code where it belongs in the shared infrastructure of icmp{,v6}_ndo_send. This commit fixes the issue by passing an empty IPCB or IP6CB along to the functions that actually do the work. For the icmp_send, this was already trivial, thanks to __icmp_send providing the plumbing function. For icmpv6_send, this required a tiny bit of refactoring to make it behave like the v4 case, after which it was straight forward. Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs") Reported-by: SinYu Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/ Signed-off-by: Jason A. Donenfeld Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 1 - include/linux/icmpv6.h | 26 ++++++++++++++++++++------ include/linux/ipv6.h | 1 - include/net/icmp.h | 6 +++++- net/ipv4/icmp.c | 5 +++-- net/ipv6/icmp.c | 18 +++++++++--------- net/ipv6/ip6_icmp.c | 12 +++++++----- 7 files changed, 44 insertions(+), 25 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 9a70f05baf6e..39c00f050fbd 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -543,7 +543,6 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && mtu < ntohs(iph->tot_len)) { netdev_dbg(dev, "packet too big, fragmentation needed\n"); - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto err_rt; diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 452d8978ffc7..9055cb380ee2 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -3,6 +3,7 @@ #define _LINUX_ICMPV6_H #include +#include #include static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) @@ -15,13 +16,16 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) #if IS_ENABLED(CONFIG_IPV6) typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr); + const struct in6_addr *force_saddr, + const struct inet6_skb_parm *parm); void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr); + const struct in6_addr *force_saddr, + const struct inet6_skb_parm *parm); #if IS_BUILTIN(CONFIG_IPV6) -static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct inet6_skb_parm *parm) { - icmp6_send(skb, type, code, info, NULL); + icmp6_send(skb, type, code, info, NULL, parm); } static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { @@ -34,18 +38,28 @@ static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) return 0; } #else -extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info); +extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct inet6_skb_parm *parm); extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); #endif +static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +{ + __icmpv6_send(skb, type, code, info, IP6CB(skb)); +} + int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); #if IS_ENABLED(CONFIG_NF_NAT) void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); #else -#define icmpv6_ndo_send icmpv6_send +static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) +{ + struct inet6_skb_parm parm = { 0 }; + __icmpv6_send(skb_in, type, code, info, &parm); +} #endif #else diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 9d1f29f0c512..70b2ad3b9884 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -85,7 +85,6 @@ struct ipv6_params { __s32 autoconf; }; extern struct ipv6_params ipv6_defaults; -#include #include #include diff --git a/include/net/icmp.h b/include/net/icmp.h index 9ac2d2672a93..fd84adc47963 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 #if IS_ENABLED(CONFIG_NF_NAT) void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else -#define icmp_ndo_send icmp_send +static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct ip_options opts = { 0 }; + __icmp_send(skb_in, type, code, info, &opts); +} #endif int icmp_rcv(struct sk_buff *skb); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 396b492c804f..616e2dc1c8fa 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; + struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; struct nf_conn *ct; __be32 orig_ip; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { - icmp_send(skb_in, type, code, info); + __icmp_send(skb_in, type, code, info, &opts); return; } @@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) orig_ip = ip_hdr(skb_in)->saddr; ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; - icmp_send(skb_in, type, code, info); + __icmp_send(skb_in, type, code, info, &opts); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f3d05866692e..fd1f896115c1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st } #if IS_ENABLED(CONFIG_IPV6_MIP6) -static void mip6_addr_swap(struct sk_buff *skb) +static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) { struct ipv6hdr *iph = ipv6_hdr(skb); - struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6_destopt_hao *hao; struct in6_addr tmp; int off; @@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buff *skb) } } #else -static inline void mip6_addr_swap(struct sk_buff *skb) {} +static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} #endif static struct dst_entry *icmpv6_route_lookup(struct net *net, @@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buff *skb) * Send an ICMP message in response to a packet in error */ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr) + const struct in6_addr *force_saddr, + const struct inet6_skb_parm *parm) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; - mip6_addr_swap(skb); + mip6_addr_swap(skb, parm); sk = icmpv6_xmit_lock(net); if (!sk) @@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, /* select a more meaningful saddr from input if */ struct net_device *in_netdev; - in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); + in_netdev = dev_get_by_index(net, parm->iif); if (in_netdev) { ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, inet6_sk(sk)->srcprefs, @@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send); */ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { - icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL); + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); kfree_skb(skb); } @@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, } if (type == ICMP_TIME_EXCEEDED) icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - info, &temp_saddr); + info, &temp_saddr, IP6CB(skb2)); else icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, - info, &temp_saddr); + info, &temp_saddr, IP6CB(skb2)); if (rt) ip6_rt_put(rt); diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 70c8c2f36c98..9e3574880cb0 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) } EXPORT_SYMBOL(inet6_unregister_icmp_sender); -void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) - send(skb, type, code, info, NULL); + send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } -EXPORT_SYMBOL(icmpv6_send); +EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { + struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; struct in6_addr orig_ip; @@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { - icmpv6_send(skb_in, type, code, info); + __icmpv6_send(skb_in, type, code, info, &parm); return; } @@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) orig_ip = ipv6_hdr(skb_in)->saddr; ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; - icmpv6_send(skb_in, type, code, info); + __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); From 9bc1ef64aeb6f7dae17e98f912213266738ddcfe Mon Sep 17 00:00:00 2001 From: Sieng Piaw Liew Date: Mon, 22 Feb 2021 09:35:30 +0800 Subject: [PATCH 324/506] bcm63xx_enet: fix sporadic kernel panic In ndo_stop functions, netdev_completed_queue() is called during forced tx reclaim, after netdev_reset_queue(). This may trigger kernel panic if there is any tx skb left. This patch moves netdev_reset_queue() to after tx reclaim, so BQL can complete successfully then reset. Signed-off-by: Sieng Piaw Liew Fixes: 4c59b0f5543d ("bcm63xx_enet: add BQL support") Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210222013530.1356-1-liew.s.piaw@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index fd8767213165..977f097fc7bf 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1192,7 +1192,6 @@ static int bcm_enet_stop(struct net_device *dev) kdev = &priv->pdev->dev; netif_stop_queue(dev); - netdev_reset_queue(dev); napi_disable(&priv->napi); if (priv->has_phy) phy_stop(dev->phydev); @@ -1231,6 +1230,9 @@ static int bcm_enet_stop(struct net_device *dev) if (priv->has_phy) phy_disconnect(dev->phydev); + /* reset BQL after forced tx reclaim to prevent kernel panic */ + netdev_reset_queue(dev); + return 0; } @@ -2343,7 +2345,6 @@ static int bcm_enetsw_stop(struct net_device *dev) del_timer_sync(&priv->swphy_poll); netif_stop_queue(dev); - netdev_reset_queue(dev); napi_disable(&priv->napi); del_timer_sync(&priv->rx_timeout); @@ -2371,6 +2372,9 @@ static int bcm_enetsw_stop(struct net_device *dev) free_irq(priv->irq_tx, dev); free_irq(priv->irq_rx, dev); + /* reset BQL after forced tx reclaim to prevent kernel panic */ + netdev_reset_queue(dev); + return 0; } From 18755e270666ce869289bceb734d25eae2be9da9 Mon Sep 17 00:00:00 2001 From: Krzysztof Halasa Date: Thu, 18 Feb 2021 13:34:42 +0100 Subject: [PATCH 325/506] Marvell Sky2 Ethernet adapter: fix warning messages. sky2.c driver uses netdev_warn() before the net device is initialized. Fix it by using dev_warn() instead. Signed-off-by: Krzysztof Halasa Link: https://lore.kernel.org/r/m3a6s1r1ul.fsf@t19.piap.pl Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/sky2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index ebe1406c6e64..dbec8e187a68 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4806,12 +4806,11 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port, if (!is_valid_ether_addr(dev->dev_addr)) { struct sockaddr sa = { AF_UNSPEC }; - netdev_warn(dev, - "Invalid MAC address, defaulting to random\n"); + dev_warn(&hw->pdev->dev, "Invalid MAC address, defaulting to random\n"); eth_hw_addr_random(dev); memcpy(sa.sa_data, dev->dev_addr, ETH_ALEN); if (sky2_set_mac_address(dev, &sa)) - netdev_warn(dev, "Failed to set MAC address.\n"); + dev_warn(&hw->pdev->dev, "Failed to set MAC address.\n"); } return dev; From e6dd86ed27d1a56bd45c50f6cc238a94c283e8e2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Feb 2021 14:30:09 -0800 Subject: [PATCH 326/506] net: dsa: bcm_sf2: Wire-up br_flags_pre, br_flags and set_mrouter Because bcm_sf2 implements its own dsa_switch_ops we need to export the b53_br_flags_pre(), b53_br_flags() and b53_set_mrouter so we can wire-up them up like they used to be with the former b53_br_egress_floods(). Fixes: a8b659e7ff75 ("net: dsa: act as passthrough for bridge port flags") Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 19 +++++++++++-------- drivers/net/dsa/b53/b53_priv.h | 8 ++++++++ drivers/net/dsa/bcm_sf2.c | 3 +++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index ae86ded1e2a1..fceca3f5b6a5 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1953,19 +1953,20 @@ void b53_br_fast_age(struct dsa_switch *ds, int port) } EXPORT_SYMBOL(b53_br_fast_age); -static int b53_br_flags_pre(struct dsa_switch *ds, int port, - struct switchdev_brport_flags flags, - struct netlink_ext_ack *extack) +int b53_br_flags_pre(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD)) return -EINVAL; return 0; } +EXPORT_SYMBOL(b53_br_flags_pre); -static int b53_br_flags(struct dsa_switch *ds, int port, - struct switchdev_brport_flags flags, - struct netlink_ext_ack *extack) +int b53_br_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { if (flags.mask & BR_FLOOD) b53_port_set_ucast_flood(ds->priv, port, @@ -1976,14 +1977,16 @@ static int b53_br_flags(struct dsa_switch *ds, int port, return 0; } +EXPORT_SYMBOL(b53_br_flags); -static int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter, - struct netlink_ext_ack *extack) +int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter, + struct netlink_ext_ack *extack) { b53_port_set_mcast_flood(ds->priv, port, mrouter); return 0; } +EXPORT_SYMBOL(b53_set_mrouter); static bool b53_possible_cpu_port(struct dsa_switch *ds, int port) { diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index faf983fbca82..8419bb7f4505 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -326,6 +326,14 @@ int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state); void b53_br_fast_age(struct dsa_switch *ds, int port); +int b53_br_flags_pre(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int b53_br_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter, + struct netlink_ext_ack *extack); int b53_setup_devlink_resources(struct dsa_switch *ds); void b53_port_event(struct dsa_switch *ds, int port); void b53_phylink_validate(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 1857aa9aa84a..3eaedbb12815 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1117,7 +1117,10 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, + .port_pre_bridge_flags = b53_br_flags_pre, + .port_bridge_flags = b53_br_flags, .port_stp_state_set = b53_br_set_stp_state, + .port_set_mrouter = b53_set_mrouter, .port_fast_age = b53_br_fast_age, .port_vlan_filtering = b53_vlan_filtering, .port_vlan_add = b53_vlan_add, From f9b3827ee66cfcf297d0acd6ecf33653a5f297ef Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Feb 2021 14:30:10 -0800 Subject: [PATCH 327/506] net: dsa: b53: Support setting learning on port Add support for being able to set the learning attribute on port, and make sure that the standalone ports start up with learning disabled. We can remove the code in bcm_sf2 that configured the ports learning attribute because we want the standalone ports to have learning disabled by default and port 7 cannot be bridged, so its learning attribute will not change past its initial configuration. Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 20 +++++++++++++++++++- drivers/net/dsa/b53/b53_regs.h | 1 + drivers/net/dsa/bcm_sf2.c | 15 +-------------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index fceca3f5b6a5..a162499bcafc 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -543,6 +543,19 @@ static void b53_port_set_mcast_flood(struct b53_device *dev, int port, b53_write16(dev, B53_CTRL_PAGE, B53_IPMC_FLOOD_MASK, mc); } +static void b53_port_set_learning(struct b53_device *dev, int port, + bool learning) +{ + u16 reg; + + b53_read16(dev, B53_CTRL_PAGE, B53_DIS_LEARNING, ®); + if (learning) + reg &= ~BIT(port); + else + reg |= BIT(port); + b53_write16(dev, B53_CTRL_PAGE, B53_DIS_LEARNING, reg); +} + int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; @@ -557,6 +570,7 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) b53_port_set_ucast_flood(dev, port, true); b53_port_set_mcast_flood(dev, port, true); + b53_port_set_learning(dev, port, false); if (dev->ops->irq_enable) ret = dev->ops->irq_enable(dev, port); @@ -691,6 +705,7 @@ static void b53_enable_cpu_port(struct b53_device *dev, int port) b53_port_set_ucast_flood(dev, port, true); b53_port_set_mcast_flood(dev, port, true); + b53_port_set_learning(dev, port, false); } static void b53_enable_mib(struct b53_device *dev) @@ -1957,7 +1972,7 @@ int b53_br_flags_pre(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack) { - if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD)) + if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD | BR_LEARNING)) return -EINVAL; return 0; @@ -1974,6 +1989,9 @@ int b53_br_flags(struct dsa_switch *ds, int port, if (flags.mask & BR_MCAST_FLOOD) b53_port_set_mcast_flood(ds->priv, port, !!(flags.val & BR_MCAST_FLOOD)); + if (flags.mask & BR_LEARNING) + b53_port_set_learning(ds->priv, port, + !!(flags.val & BR_LEARNING)); return 0; } diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index c90985c294a2..b2c539a42154 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -115,6 +115,7 @@ #define B53_UC_FLOOD_MASK 0x32 #define B53_MC_FLOOD_MASK 0x34 #define B53_IPMC_FLOOD_MASK 0x36 +#define B53_DIS_LEARNING 0x3c /* * Override Ports 0-7 State on devices with xMII interfaces (8 bit) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 3eaedbb12815..5ee8103b8e9c 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -223,23 +223,10 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, reg &= ~P_TXQ_PSM_VDD(port); core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); - /* Enable learning */ - reg = core_readl(priv, CORE_DIS_LEARN); - reg &= ~BIT(port); - core_writel(priv, reg, CORE_DIS_LEARN); - /* Enable Broadcom tags for that port if requested */ - if (priv->brcm_tag_mask & BIT(port)) { + if (priv->brcm_tag_mask & BIT(port)) b53_brcm_hdr_setup(ds, port); - /* Disable learning on ASP port */ - if (port == 7) { - reg = core_readl(priv, CORE_DIS_LEARN); - reg |= BIT(port); - core_writel(priv, reg, CORE_DIS_LEARN); - } - } - /* Configure Traffic Class to QoS mapping, allow each priority to map * to a different queue number */ From 3aed8b63336c3f81a4fd72808dcf6197fabbbdb2 Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 23 Feb 2021 15:11:55 +0800 Subject: [PATCH 328/506] net/sched: cls_flower: validate ct_state for invalid and reply flags Add invalid and reply flags validate in the fl_validate_ct_state. This makes the checking complete if compared to ovs' validate_ct_state(). Signed-off-by: wenxu Reviewed-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/1614064315-364-1-git-send-email-wenxu@ucloud.cn Signed-off-by: Jakub Kicinski --- net/sched/cls_flower.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2409e522c68f..d097b5c15faa 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1417,6 +1417,21 @@ static int fl_validate_ct_state(u16 state, struct nlattr *tb, return -EINVAL; } + if (state & TCA_FLOWER_KEY_CT_FLAGS_INVALID && + state & ~(TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "when inv is set, only trk may be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_REPLY) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and rpl are mutually exclusive"); + return -EINVAL; + } + return 0; } From 7a0ae61acde2cebd69665837170405eced86a6c7 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:40 +0800 Subject: [PATCH 329/506] r8152: enable U1/U2 for USB_SPEED_SUPER U1/U2 shoued be enabled for USB 3.0 or later. The USB 2.0 doesn't support it. Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 2d7cc63bef89..4bfee289aa6f 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3360,7 +3360,7 @@ static void rtl8153b_runtime_enable(struct r8152 *tp, bool enable) r8153b_ups_en(tp, false); r8153_queue_wake(tp, false); rtl_runtime_suspend_enable(tp, false); - if (tp->udev->speed != USB_SPEED_HIGH) + if (tp->udev->speed >= USB_SPEED_SUPER) r8153b_u1u2en(tp, true); } } @@ -5056,7 +5056,7 @@ static void rtl8153b_up(struct r8152 *tp) r8153_aldps_en(tp, true); - if (tp->udev->speed != USB_SPEED_HIGH) + if (tp->udev->speed >= USB_SPEED_SUPER) r8153b_u1u2en(tp, true); } @@ -5572,8 +5572,9 @@ static void r8153b_init(struct r8152 *tp) ocp_data |= POLL_LINK_CHG; ocp_write_word(tp, MCU_TYPE_PLA, PLA_EXTRA_STATUS, ocp_data); - if (tp->udev->speed != USB_SPEED_HIGH) + if (tp->udev->speed >= USB_SPEED_SUPER) r8153b_u1u2en(tp, true); + usb_enable_lpm(tp->udev); /* MAC clock speed down */ From c79515e47935c747282c6ed2ee5b2ef039756eeb Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:41 +0800 Subject: [PATCH 330/506] r8152: check if the pointer of the function exists Return error code if autosuspend_en, eee_get, or eee_set don't exist. Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 4bfee289aa6f..baa63ea2590a 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -5757,6 +5757,9 @@ static int rtl8152_runtime_suspend(struct r8152 *tp) struct net_device *netdev = tp->netdev; int ret = 0; + if (!tp->rtl_ops.autosuspend_en) + return -EBUSY; + set_bit(SELECTIVE_SUSPEND, &tp->flags); smp_mb__after_atomic(); @@ -6156,6 +6159,11 @@ rtl_ethtool_get_eee(struct net_device *net, struct ethtool_eee *edata) struct r8152 *tp = netdev_priv(net); int ret; + if (!tp->rtl_ops.eee_get) { + ret = -EOPNOTSUPP; + goto out; + } + ret = usb_autopm_get_interface(tp->intf); if (ret < 0) goto out; @@ -6178,6 +6186,11 @@ rtl_ethtool_set_eee(struct net_device *net, struct ethtool_eee *edata) struct r8152 *tp = netdev_priv(net); int ret; + if (!tp->rtl_ops.eee_set) { + ret = -EOPNOTSUPP; + goto out; + } + ret = usb_autopm_get_interface(tp->intf); if (ret < 0) goto out; From 156c3207611262266f0eea589ac3f00c5657320e Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:42 +0800 Subject: [PATCH 331/506] r8152: replace netif_err with dev_err Some messages are before calling register_netdev(), so replace netif_err() with dev_err(). Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index baa63ea2590a..82a129264e31 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -6590,7 +6590,7 @@ static int rtl_ops_init(struct r8152 *tp) default: ret = -ENODEV; - netif_err(tp, probe, tp->netdev, "Unknown Device\n"); + dev_err(&tp->intf->dev, "Unknown Device\n"); break; } @@ -6847,7 +6847,7 @@ static int rtl8152_probe(struct usb_interface *intf, ret = register_netdev(netdev); if (ret != 0) { - netif_err(tp, probe, netdev, "couldn't register the device\n"); + dev_err(&intf->dev, "couldn't register the device\n"); goto out1; } From 40fa7568ac230446d888b7ad402cff9e20fe3ad5 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Feb 2021 17:04:43 +0800 Subject: [PATCH 332/506] r8152: spilt rtl_set_eee_plus and r8153b_green_en Add rtl_eee_plus_en() and rtl_green_en(). Signed-off-by: Hayes Wang Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 82a129264e31..b246817f3405 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2632,21 +2632,24 @@ static inline u8 rtl8152_get_speed(struct r8152 *tp) return ocp_read_byte(tp, MCU_TYPE_PLA, PLA_PHYSTATUS); } -static void rtl_set_eee_plus(struct r8152 *tp) +static void rtl_eee_plus_en(struct r8152 *tp, bool enable) { u32 ocp_data; - u8 speed; - speed = rtl8152_get_speed(tp); - if (speed & _10bps) { - ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR); + if (enable) ocp_data |= EEEP_CR_EEEP_TX; - ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR, ocp_data); - } else { - ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR); + else ocp_data &= ~EEEP_CR_EEEP_TX; - ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR, ocp_data); - } + ocp_write_word(tp, MCU_TYPE_PLA, PLA_EEEP_CR, ocp_data); +} + +static void rtl_set_eee_plus(struct r8152 *tp) +{ + if (rtl8152_get_speed(tp) & _10bps) + rtl_eee_plus_en(tp, true); + else + rtl_eee_plus_en(tp, false); } static void rxdy_gated_en(struct r8152 *tp, bool enable) @@ -3150,10 +3153,22 @@ static void r8153b_ups_flags(struct r8152 *tp) ocp_write_dword(tp, MCU_TYPE_USB, USB_UPS_FLAGS, ups_flags); } -static void r8153b_green_en(struct r8152 *tp, bool enable) +static void rtl_green_en(struct r8152 *tp, bool enable) { u16 data; + data = sram_read(tp, SRAM_GREEN_CFG); + if (enable) + data |= GREEN_ETH_EN; + else + data &= ~GREEN_ETH_EN; + sram_write(tp, SRAM_GREEN_CFG, data); + + tp->ups_info.green = enable; +} + +static void r8153b_green_en(struct r8152 *tp, bool enable) +{ if (enable) { sram_write(tp, 0x8045, 0); /* 10M abiq&ldvbias */ sram_write(tp, 0x804d, 0x1222); /* 100M short abiq&ldvbias */ @@ -3164,11 +3179,7 @@ static void r8153b_green_en(struct r8152 *tp, bool enable) sram_write(tp, 0x805d, 0x2444); /* 1000M short abiq&ldvbias */ } - data = sram_read(tp, SRAM_GREEN_CFG); - data |= GREEN_ETH_EN; - sram_write(tp, SRAM_GREEN_CFG, data); - - tp->ups_info.green = enable; + rtl_green_en(tp, true); } static u16 r8153_phy_status(struct r8152 *tp, u16 desired) From 92584ddf550ae72d492858c19d1f9025e07a9350 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 21 Feb 2021 15:45:52 +0000 Subject: [PATCH 333/506] vxlan: move debug check after netdev unregister The debug check must be done after unregister_netdevice_many() call -- the hlist_del_rcu() for this is done inside .ndo_stop. This is the same with commit 0fda7600c2e1 ("geneve: move debug check after netdev unregister") Test commands: ip netns del A ip netns add A ip netns add B ip netns exec B ip link add vxlan0 type vxlan vni 100 local 10.0.0.1 \ remote 10.0.0.2 dstport 4789 srcport 4789 4789 ip netns exec B ip link set vxlan0 netns A ip netns exec A ip link set vxlan0 up ip netns del B Splat looks like: [ 73.176249][ T7] ------------[ cut here ]------------ [ 73.178662][ T7] WARNING: CPU: 4 PID: 7 at drivers/net/vxlan.c:4743 vxlan_exit_batch_net+0x52e/0x720 [vxlan] [ 73.182597][ T7] Modules linked in: vxlan openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 mlx5_core nfp mlxfw ixgbevf tls sch_fq_codel nf_tables nfnetlink ip_tables x_tables unix [ 73.190113][ T7] CPU: 4 PID: 7 Comm: kworker/u16:0 Not tainted 5.11.0-rc7+ #838 [ 73.193037][ T7] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 73.196986][ T7] Workqueue: netns cleanup_net [ 73.198946][ T7] RIP: 0010:vxlan_exit_batch_net+0x52e/0x720 [vxlan] [ 73.201509][ T7] Code: 00 01 00 00 0f 84 39 fd ff ff 48 89 ca 48 c1 ea 03 80 3c 1a 00 0f 85 a6 00 00 00 89 c2 48 83 c2 02 49 8b 14 d4 48 85 d2 74 ce <0f> 0b eb ca e8 b9 51 db dd 84 c0 0f 85 4a fe ff ff 48 c7 c2 80 bc [ 73.208813][ T7] RSP: 0018:ffff888100907c10 EFLAGS: 00010286 [ 73.211027][ T7] RAX: 000000000000003c RBX: dffffc0000000000 RCX: ffff88800ec411f0 [ 73.213702][ T7] RDX: ffff88800a278000 RSI: ffff88800fc78c70 RDI: ffff88800fc78070 [ 73.216169][ T7] RBP: ffff88800b5cbdc0 R08: fffffbfff424de61 R09: fffffbfff424de61 [ 73.218463][ T7] R10: ffffffffa126f307 R11: fffffbfff424de60 R12: ffff88800ec41000 [ 73.220794][ T7] R13: ffff888100907d08 R14: ffff888100907c50 R15: ffff88800fc78c40 [ 73.223337][ T7] FS: 0000000000000000(0000) GS:ffff888114800000(0000) knlGS:0000000000000000 [ 73.225814][ T7] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 73.227616][ T7] CR2: 0000562b5cb4f4d0 CR3: 0000000105fbe001 CR4: 00000000003706e0 [ 73.229700][ T7] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 73.231820][ T7] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 73.233844][ T7] Call Trace: [ 73.234698][ T7] ? vxlan_err_lookup+0x3c0/0x3c0 [vxlan] [ 73.235962][ T7] ? ops_exit_list.isra.11+0x93/0x140 [ 73.237134][ T7] cleanup_net+0x45e/0x8a0 [ ... ] Fixes: 57b61127ab7d ("vxlan: speedup vxlan tunnels dismantle") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20210221154552.11749-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3929e437382b..666dd201c3d5 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -4721,7 +4721,6 @@ static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan, *next; struct net_device *dev, *aux; - unsigned int h; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &vxlan_link_ops) @@ -4735,14 +4734,13 @@ static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) unregister_netdevice_queue(vxlan->dev, head); } - for (h = 0; h < PORT_HASH_SIZE; ++h) - WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); } static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) { struct net *net; LIST_HEAD(list); + unsigned int h; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { @@ -4755,6 +4753,13 @@ static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) unregister_netdevice_many(&list); rtnl_unlock(); + + list_for_each_entry(net, net_list, exit_list) { + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + + for (h = 0; h < PORT_HASH_SIZE; ++h) + WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); + } } static struct pernet_operations vxlan_net_ops = { From fc0494ead6398609c49afa37bc949b61c5c16b91 Mon Sep 17 00:00:00 2001 From: Takeshi Misawa Date: Mon, 22 Feb 2021 08:44:27 +0900 Subject: [PATCH 334/506] net: qrtr: Fix memory leak in qrtr_tun_open If qrtr_endpoint_register() failed, tun is leaked. Fix this, by freeing tun in error path. syzbot report: BUG: memory leak unreferenced object 0xffff88811848d680 (size 64): comm "syz-executor684", pid 10171, jiffies 4294951561 (age 26.070s) hex dump (first 32 bytes): 80 dd 0a 84 ff ff ff ff 00 00 00 00 00 00 00 00 ................ 90 d6 48 18 81 88 ff ff 90 d6 48 18 81 88 ff ff ..H.......H..... backtrace: [<0000000018992a50>] kmalloc include/linux/slab.h:552 [inline] [<0000000018992a50>] kzalloc include/linux/slab.h:682 [inline] [<0000000018992a50>] qrtr_tun_open+0x22/0x90 net/qrtr/tun.c:35 [<0000000003a453ef>] misc_open+0x19c/0x1e0 drivers/char/misc.c:141 [<00000000dec38ac8>] chrdev_open+0x10d/0x340 fs/char_dev.c:414 [<0000000079094996>] do_dentry_open+0x1e6/0x620 fs/open.c:817 [<000000004096d290>] do_open fs/namei.c:3252 [inline] [<000000004096d290>] path_openat+0x74a/0x1b00 fs/namei.c:3369 [<00000000b8e64241>] do_filp_open+0xa0/0x190 fs/namei.c:3396 [<00000000a3299422>] do_sys_openat2+0xed/0x230 fs/open.c:1172 [<000000002c1bdcef>] do_sys_open fs/open.c:1188 [inline] [<000000002c1bdcef>] __do_sys_openat fs/open.c:1204 [inline] [<000000002c1bdcef>] __se_sys_openat fs/open.c:1199 [inline] [<000000002c1bdcef>] __x64_sys_openat+0x7f/0xe0 fs/open.c:1199 [<00000000f3a5728f>] do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 [<000000004b38b7ec>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 28fb4e59a47d ("net: qrtr: Expose tunneling endpoint to user space") Reported-by: syzbot+5d6e4af21385f5cfc56a@syzkaller.appspotmail.com Signed-off-by: Takeshi Misawa Link: https://lore.kernel.org/r/20210221234427.GA2140@DESKTOP Signed-off-by: Jakub Kicinski --- net/qrtr/tun.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index b238c40a9984..304b41fea5ab 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -31,6 +31,7 @@ static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb) static int qrtr_tun_open(struct inode *inode, struct file *filp) { struct qrtr_tun *tun; + int ret; tun = kzalloc(sizeof(*tun), GFP_KERNEL); if (!tun) @@ -43,7 +44,16 @@ static int qrtr_tun_open(struct inode *inode, struct file *filp) filp->private_data = tun; - return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO); + ret = qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO); + if (ret) + goto out; + + return 0; + +out: + filp->private_data = NULL; + kfree(tun); + return ret; } static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to) From 30ac4e2f54ec067b7b9ca0db27e75681581378d6 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 22 Feb 2021 17:25:43 +0100 Subject: [PATCH 335/506] wireguard: avoid double unlikely() notation when using IS_ERR() The definition of IS_ERR() already applies the unlikely() notation when checking the error status of the passed pointer. For this reason there is no need to have the same notation outside of IS_ERR() itself. Clean up code by removing redundant notation. Signed-off-by: Antonio Quartulli Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 2 +- drivers/net/wireguard/socket.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index a3ed49cd95c3..cd51a2afa28e 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -157,7 +157,7 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) } else { struct sk_buff *segs = skb_gso_segment(skb, 0); - if (unlikely(IS_ERR(segs))) { + if (IS_ERR(segs)) { ret = PTR_ERR(segs); goto err_peer; } diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 410b318e57fb..41430c0e465a 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -71,7 +71,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, ip_rt_put(rt); rt = ip_route_output_flow(sock_net(sock), &fl, sock); } - if (unlikely(IS_ERR(rt))) { + if (IS_ERR(rt)) { ret = PTR_ERR(rt); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); @@ -138,7 +138,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, } dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sock), sock, &fl, NULL); - if (unlikely(IS_ERR(dst))) { + if (IS_ERR(dst)) { ret = PTR_ERR(dst); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); From 7f57bd8dc22de35ddd895294aa554003e4f19a72 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 22 Feb 2021 17:25:44 +0100 Subject: [PATCH 336/506] wireguard: socket: remove bogus __be32 annotation The endpoint->src_if4 has nothing to do with fixed-endian numbers; remove the bogus annotation. This was introduced in https://git.zx2c4.com/wireguard-monolithic-historical/commit?id=14e7d0a499a676ec55176c0de2f9fcbd34074a82 in the historical WireGuard repo because the old code used to zero-initialize multiple members as follows: endpoint->src4.s_addr = endpoint->src_if4 = fl.saddr = 0; Because fl.saddr is fixed-endian and an assignment returns a value with the type of its left operand, this meant that sparse detected an assignment between values of different endianness. Since then, this assignment was already split up into separate statements; just the cast survived. Signed-off-by: Jann Horn Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 41430c0e465a..d9ad850daa79 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -53,7 +53,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0, fl.saddr, RT_SCOPE_HOST))) { endpoint->src4.s_addr = 0; - *(__force __be32 *)&endpoint->src_if4 = 0; + endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); @@ -63,7 +63,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, PTR_ERR(rt) == -EINVAL) || (!IS_ERR(rt) && rt->dst.dev->ifindex != endpoint->src_if4)))) { endpoint->src4.s_addr = 0; - *(__force __be32 *)&endpoint->src_if4 = 0; + endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); From d5a49aa6c3e264a93a7d08485d66e346be0969dd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:45 +0100 Subject: [PATCH 337/506] wireguard: selftests: test multiple parallel streams In order to test ndo_start_xmit being called in parallel, explicitly add separate tests, which should all run on different cores. This should help tease out bugs associated with queueing up packets from different cores in parallel. Currently, it hasn't found those types of bugs, but given future planned work, this is a useful regression to avoid. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/netns.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 74c69b75f6f5..7ed7cd95e58f 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -39,7 +39,7 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; } ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; } ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; } sleep() { read -t "$1" -N 1 || true; } -waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } +waitiperf() { pretty "${1//*-}" "wait for iperf:${3:-5201} pid $2"; while [[ $(ss -N "$1" -tlpH "sport = ${3:-5201}") != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; } waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; } @@ -141,6 +141,19 @@ tests() { n2 iperf3 -s -1 -B fd00::2 & waitiperf $netns2 $! n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2 + + # TCP over IPv4, in parallel + for max in 4 5 50; do + local pids=( ) + for ((i=0; i < max; ++i)) do + n2 iperf3 -p $(( 5200 + i )) -s -1 -B 192.168.241.2 & + pids+=( $! ); waitiperf $netns2 $! $(( 5200 + i )) + done + for ((i=0; i < max; ++i)) do + n1 iperf3 -Z -t 3 -p $(( 5200 + i )) -c 192.168.241.2 & + done + wait "${pids[@]}" + done } [[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}" From 5a0598695634a6bb4126818902dd9140cd9df8b6 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:46 +0100 Subject: [PATCH 338/506] wireguard: peer: put frequently used members above cache lines The is_dead boolean is checked for every single packet, while the internal_id member is used basically only for pr_debug messages. So it makes sense to hoist up is_dead into some space formerly unused by a struct hole, while demoting internal_api to below the lowest struct cache line. Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/peer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/peer.h b/drivers/net/wireguard/peer.h index 23af40922997..aaff8de6e34b 100644 --- a/drivers/net/wireguard/peer.h +++ b/drivers/net/wireguard/peer.h @@ -39,6 +39,7 @@ struct wg_peer { struct crypt_queue tx_queue, rx_queue; struct sk_buff_head staged_packet_queue; int serial_work_cpu; + bool is_dead; struct noise_keypairs keypairs; struct endpoint endpoint; struct dst_cache endpoint_cache; @@ -61,9 +62,8 @@ struct wg_peer { struct rcu_head rcu; struct list_head peer_list; struct list_head allowedips_list; - u64 internal_id; struct napi_struct napi; - bool is_dead; + u64 internal_id; }; struct wg_peer *wg_peer_create(struct wg_device *wg, From 99fff5264e7ab06f45b0ad60243475be0a8d0559 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:47 +0100 Subject: [PATCH 339/506] wireguard: device: do not generate ICMP for non-IP packets If skb->protocol doesn't match the actual skb->data header, it's probably not a good idea to pass it off to icmp{,v6}_ndo_send, which is expecting to reply to a valid IP packet. So this commit has that early mismatch case jump to a later error label. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index cd51a2afa28e..8502e1b083ff 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -138,7 +138,7 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) else if (skb->protocol == htons(ETH_P_IPV6)) net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n", dev->name, &ipv6_hdr(skb)->daddr); - goto err; + goto err_icmp; } family = READ_ONCE(peer->endpoint.addr.sa_family); @@ -201,12 +201,13 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) err_peer: wg_peer_put(peer); -err: - ++dev->stats.tx_errors; +err_icmp: if (skb->protocol == htons(ETH_P_IP)) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); else if (skb->protocol == htons(ETH_P_IPV6)) icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); +err: + ++dev->stats.tx_errors; kfree_skb(skb); return ret; } From 8b5553ace83cced775eefd0f3f18b5c6214ccf7a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:48 +0100 Subject: [PATCH 340/506] wireguard: queueing: get rid of per-peer ring buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Having two ring buffers per-peer means that every peer results in two massive ring allocations. On an 8-core x86_64 machine, this commit reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which is an 90% reduction. Ninety percent! With some single-machine deployments approaching 500,000 peers, we're talking about a reduction from 7 gigs of memory down to 700 megs of memory. In order to get rid of these per-peer allocations, this commit switches to using a list-based queueing approach. Currently GSO fragments are chained together using the skb->next pointer (the skb_list_* singly linked list approach), so we form the per-peer queue around the unused skb->prev pointer (which sort of makes sense because the links are pointing backwards). Use of skb_queue_* is not possible here, because that is based on doubly linked lists and spinlocks. Multiple cores can write into the queue at any given time, because its writes occur in the start_xmit path or in the udp_recv path. But reads happen in a single workqueue item per-peer, amounting to a multi-producer, single-consumer paradigm. The MPSC queue is implemented locklessly and never blocks. However, it is not linearizable (though it is serializable), with a very tight and unlikely race on writes, which, when hit (some tiny fraction of the 0.15% of partial adds on a fully loaded 16-core x86_64 system), causes the queue reader to terminate early. However, because every packet sent queues up the same workqueue item after it is fully added, the worker resumes again, and stopping early isn't actually a problem, since at that point the packet wouldn't have yet been added to the encryption queue. These properties allow us to avoid disabling interrupts or spinning. The design is based on Dmitry Vyukov's algorithm [1]. Performance-wise, ordinarily list-based queues aren't preferable to ringbuffers, because of cache misses when following pointers around. However, we *already* have to follow the adjacent pointers when working through fragments, so there shouldn't actually be any change there. A potential downside is that dequeueing is a bit more complicated, but the ptr_ring structure used prior had a spinlock when dequeueing, so all and all the difference appears to be a wash. Actually, from profiling, the biggest performance hit, by far, of this commit winds up being atomic_add_unless(count, 1, max) and atomic_ dec(count), which account for the majority of CPU time, according to perf. In that sense, the previous ring buffer was superior in that it could check if it was full by head==tail, which the list-based approach cannot do. But all and all, this enables us to get massive memory savings, allowing WireGuard to scale for real world deployments, without taking much of a performance hit. [1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue Reviewed-by: Dmitry Vyukov Reviewed-by: Toke Høiland-Jørgensen Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 12 ++--- drivers/net/wireguard/device.h | 15 +++--- drivers/net/wireguard/peer.c | 28 ++++------- drivers/net/wireguard/peer.h | 4 +- drivers/net/wireguard/queueing.c | 86 +++++++++++++++++++++++++------- drivers/net/wireguard/queueing.h | 45 ++++++++++++----- drivers/net/wireguard/receive.c | 16 +++--- drivers/net/wireguard/send.c | 31 ++++-------- 8 files changed, 144 insertions(+), 93 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 8502e1b083ff..551ddaaaf540 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -235,8 +235,8 @@ static void wg_destruct(struct net_device *dev) destroy_workqueue(wg->handshake_receive_wq); destroy_workqueue(wg->handshake_send_wq); destroy_workqueue(wg->packet_crypt_wq); - wg_packet_queue_free(&wg->decrypt_queue, true); - wg_packet_queue_free(&wg->encrypt_queue, true); + wg_packet_queue_free(&wg->decrypt_queue); + wg_packet_queue_free(&wg->encrypt_queue); rcu_barrier(); /* Wait for all the peers to be actually freed. */ wg_ratelimiter_uninit(); memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); @@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, goto err_destroy_handshake_send; ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker, - true, MAX_QUEUED_PACKETS); + MAX_QUEUED_PACKETS); if (ret < 0) goto err_destroy_packet_crypt; ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker, - true, MAX_QUEUED_PACKETS); + MAX_QUEUED_PACKETS); if (ret < 0) goto err_free_encrypt_queue; @@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, err_uninit_ratelimiter: wg_ratelimiter_uninit(); err_free_decrypt_queue: - wg_packet_queue_free(&wg->decrypt_queue, true); + wg_packet_queue_free(&wg->decrypt_queue); err_free_encrypt_queue: - wg_packet_queue_free(&wg->encrypt_queue, true); + wg_packet_queue_free(&wg->encrypt_queue); err_destroy_packet_crypt: destroy_workqueue(wg->packet_crypt_wq); err_destroy_handshake_send: diff --git a/drivers/net/wireguard/device.h b/drivers/net/wireguard/device.h index 4d0144e16947..854bc3d97150 100644 --- a/drivers/net/wireguard/device.h +++ b/drivers/net/wireguard/device.h @@ -27,13 +27,14 @@ struct multicore_worker { struct crypt_queue { struct ptr_ring ring; - union { - struct { - struct multicore_worker __percpu *worker; - int last_cpu; - }; - struct work_struct work; - }; + struct multicore_worker __percpu *worker; + int last_cpu; +}; + +struct prev_queue { + struct sk_buff *head, *tail, *peeked; + struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff. + atomic_t count; }; struct wg_device { diff --git a/drivers/net/wireguard/peer.c b/drivers/net/wireguard/peer.c index b3b6370e6b95..cd5cb0292cb6 100644 --- a/drivers/net/wireguard/peer.c +++ b/drivers/net/wireguard/peer.c @@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg_device *wg, peer = kzalloc(sizeof(*peer), GFP_KERNEL); if (unlikely(!peer)) return ERR_PTR(ret); - peer->device = wg; + if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) + goto err; + peer->device = wg; wg_noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, preshared_key, peer); - if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) - goto err_1; - if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false, - MAX_QUEUED_PACKETS)) - goto err_2; - if (wg_packet_queue_init(&peer->rx_queue, NULL, false, - MAX_QUEUED_PACKETS)) - goto err_3; - peer->internal_id = atomic64_inc_return(&peer_counter); peer->serial_work_cpu = nr_cpumask_bits; wg_cookie_init(&peer->latest_cookie); wg_timers_init(peer); wg_cookie_checker_precompute_peer_keys(peer); spin_lock_init(&peer->keypairs.keypair_update_lock); - INIT_WORK(&peer->transmit_handshake_work, - wg_packet_handshake_send_worker); + INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); + INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); + wg_prev_queue_init(&peer->tx_queue); + wg_prev_queue_init(&peer->rx_queue); rwlock_init(&peer->endpoint_lock); kref_init(&peer->refcount); skb_queue_head_init(&peer->staged_packet_queue); @@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg_device *wg, pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); return peer; -err_3: - wg_packet_queue_free(&peer->tx_queue, false); -err_2: - dst_cache_destroy(&peer->endpoint_cache); -err_1: +err: kfree(peer); return ERR_PTR(ret); } @@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head *rcu) struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); dst_cache_destroy(&peer->endpoint_cache); - wg_packet_queue_free(&peer->rx_queue, false); - wg_packet_queue_free(&peer->tx_queue, false); + WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); /* The final zeroing takes care of clearing any remaining handshake key * material and other potentially sensitive information. diff --git a/drivers/net/wireguard/peer.h b/drivers/net/wireguard/peer.h index aaff8de6e34b..8d53b687a1d1 100644 --- a/drivers/net/wireguard/peer.h +++ b/drivers/net/wireguard/peer.h @@ -36,7 +36,7 @@ struct endpoint { struct wg_peer { struct wg_device *device; - struct crypt_queue tx_queue, rx_queue; + struct prev_queue tx_queue, rx_queue; struct sk_buff_head staged_packet_queue; int serial_work_cpu; bool is_dead; @@ -46,7 +46,7 @@ struct wg_peer { rwlock_t endpoint_lock; struct noise_handshake handshake; atomic64_t last_sent_handshake; - struct work_struct transmit_handshake_work, clear_peer_work; + struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work; struct cookie latest_cookie; struct hlist_node pubkey_hash; u64 rx_bytes, tx_bytes; diff --git a/drivers/net/wireguard/queueing.c b/drivers/net/wireguard/queueing.c index 71b8e80b58e1..48e7b982a307 100644 --- a/drivers/net/wireguard/queueing.c +++ b/drivers/net/wireguard/queueing.c @@ -9,8 +9,7 @@ struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) { int cpu; - struct multicore_worker __percpu *worker = - alloc_percpu(struct multicore_worker); + struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker); if (!worker) return NULL; @@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) } int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, - bool multicore, unsigned int len) + unsigned int len) { int ret; @@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL); if (ret) return ret; - if (function) { - if (multicore) { - queue->worker = wg_packet_percpu_multicore_worker_alloc( - function, queue); - if (!queue->worker) { - ptr_ring_cleanup(&queue->ring, NULL); - return -ENOMEM; - } - } else { - INIT_WORK(&queue->work, function); - } + queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue); + if (!queue->worker) { + ptr_ring_cleanup(&queue->ring, NULL); + return -ENOMEM; } return 0; } -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore) +void wg_packet_queue_free(struct crypt_queue *queue) { - if (multicore) - free_percpu(queue->worker); + free_percpu(queue->worker); WARN_ON(!__ptr_ring_empty(&queue->ring)); ptr_ring_cleanup(&queue->ring, NULL); } + +#define NEXT(skb) ((skb)->prev) +#define STUB(queue) ((struct sk_buff *)&queue->empty) + +void wg_prev_queue_init(struct prev_queue *queue) +{ + NEXT(STUB(queue)) = NULL; + queue->head = queue->tail = STUB(queue); + queue->peeked = NULL; + atomic_set(&queue->count, 0); + BUILD_BUG_ON( + offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) - + offsetof(struct prev_queue, empty) || + offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) - + offsetof(struct prev_queue, empty)); +} + +static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) +{ + WRITE_ONCE(NEXT(skb), NULL); + WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb); +} + +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) +{ + if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS)) + return false; + __wg_prev_queue_enqueue(queue, skb); + return true; +} + +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue) +{ + struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail)); + + if (tail == STUB(queue)) { + if (!next) + return NULL; + queue->tail = next; + tail = next; + next = smp_load_acquire(&NEXT(next)); + } + if (next) { + queue->tail = next; + atomic_dec(&queue->count); + return tail; + } + if (tail != READ_ONCE(queue->head)) + return NULL; + __wg_prev_queue_enqueue(queue, STUB(queue)); + next = smp_load_acquire(&NEXT(tail)); + if (next) { + queue->tail = next; + atomic_dec(&queue->count); + return tail; + } + return NULL; +} + +#undef NEXT +#undef STUB diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index dfb674e03076..4ef2944a68bc 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -17,12 +17,13 @@ struct wg_device; struct wg_peer; struct multicore_worker; struct crypt_queue; +struct prev_queue; struct sk_buff; /* queueing.c APIs: */ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, - bool multicore, unsigned int len); -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore); + unsigned int len); +void wg_packet_queue_free(struct crypt_queue *queue); struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); @@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online(int *next) return cpu; } +void wg_prev_queue_init(struct prev_queue *queue); + +/* Multi producer */ +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); + +/* Single consumer */ +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); + +/* Single consumer */ +static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) +{ + if (queue->peeked) + return queue->peeked; + queue->peeked = wg_prev_queue_dequeue(queue); + return queue->peeked; +} + +/* Single consumer */ +static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) +{ + queue->peeked = NULL; +} + static inline int wg_queue_enqueue_per_device_and_peer( - struct crypt_queue *device_queue, struct crypt_queue *peer_queue, + struct crypt_queue *device_queue, struct prev_queue *peer_queue, struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu) { int cpu; @@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_device_and_peer( /* We first queue this up for the peer ingestion, but the consumer * will wait for the state to change to CRYPTED or DEAD before. */ - if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb))) + if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) return -ENOSPC; + /* Then we queue it up in the device queue, which consumes the * packet as soon as it can. */ @@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_device_and_peer( return 0; } -static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue, - struct sk_buff *skb, - enum packet_state state) +static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. @@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue, struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); - queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, - peer->internal_id), - peer->device->packet_crypt_wq, &queue->work); + queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), + peer->device->packet_crypt_wq, &peer->transmit_packet_work); wg_peer_put(peer); } -static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb, - enum packet_state state) +static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 2c9551ea6dc7..7dc84bcca261 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -444,7 +444,6 @@ static void wg_packet_consume_data_done(struct wg_peer *peer, int wg_packet_rx_poll(struct napi_struct *napi, int budget) { struct wg_peer *peer = container_of(napi, struct wg_peer, napi); - struct crypt_queue *queue = &peer->rx_queue; struct noise_keypair *keypair; struct endpoint endpoint; enum packet_state state; @@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct *napi, int budget) if (unlikely(budget <= 0)) return 0; - while ((skb = __ptr_ring_peek(&queue->ring)) != NULL && + while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL && (state = atomic_read_acquire(&PACKET_CB(skb)->state)) != PACKET_STATE_UNCRYPTED) { - __ptr_ring_discard_one(&queue->ring); - peer = PACKET_PEER(skb); + wg_prev_queue_drop_peeked(&peer->rx_queue); keypair = PACKET_CB(skb)->keypair; free = true; @@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct work_struct *work) enum packet_state state = likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ? PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; - wg_queue_enqueue_per_peer_napi(skb, state); + wg_queue_enqueue_per_peer_rx(skb, state); if (need_resched()) cond_resched(); } @@ -531,12 +529,10 @@ static void wg_packet_consume_data(struct wg_device *wg, struct sk_buff *skb) if (unlikely(READ_ONCE(peer->is_dead))) goto err; - ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, - &peer->rx_queue, skb, - wg->packet_crypt_wq, - &wg->decrypt_queue.last_cpu); + ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb, + wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu); if (unlikely(ret == -EPIPE)) - wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD); + wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD); if (likely(!ret || ret == -EPIPE)) { rcu_read_unlock_bh(); return; diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index f74b9341ab0f..5368f7c35b4b 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_peer *peer) wg_packet_send_staged_packets(peer); } -static void wg_packet_create_data_done(struct sk_buff *first, - struct wg_peer *peer) +static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first) { struct sk_buff *skb, *next; bool is_keepalive, data_sent = false; @@ -262,22 +261,19 @@ static void wg_packet_create_data_done(struct sk_buff *first, void wg_packet_tx_worker(struct work_struct *work) { - struct crypt_queue *queue = container_of(work, struct crypt_queue, - work); + struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work); struct noise_keypair *keypair; enum packet_state state; struct sk_buff *first; - struct wg_peer *peer; - while ((first = __ptr_ring_peek(&queue->ring)) != NULL && + while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL && (state = atomic_read_acquire(&PACKET_CB(first)->state)) != PACKET_STATE_UNCRYPTED) { - __ptr_ring_discard_one(&queue->ring); - peer = PACKET_PEER(first); + wg_prev_queue_drop_peeked(&peer->tx_queue); keypair = PACKET_CB(first)->keypair; if (likely(state == PACKET_STATE_CRYPTED)) - wg_packet_create_data_done(first, peer); + wg_packet_create_data_done(peer, first); else kfree_skb_list(first); @@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct work_struct *work) break; } } - wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first, - state); + wg_queue_enqueue_per_peer_tx(first, state); if (need_resched()) cond_resched(); } } -static void wg_packet_create_data(struct sk_buff *first) +static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first) { - struct wg_peer *peer = PACKET_PEER(first); struct wg_device *wg = peer->device; int ret = -EINVAL; @@ -323,13 +317,10 @@ static void wg_packet_create_data(struct sk_buff *first) if (unlikely(READ_ONCE(peer->is_dead))) goto err; - ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, - &peer->tx_queue, first, - wg->packet_crypt_wq, - &wg->encrypt_queue.last_cpu); + ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first, + wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu); if (unlikely(ret == -EPIPE)) - wg_queue_enqueue_per_peer(&peer->tx_queue, first, - PACKET_STATE_DEAD); + wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD); err: rcu_read_unlock_bh(); if (likely(!ret || ret == -EPIPE)) @@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struct wg_peer *peer) packets.prev->next = NULL; wg_peer_get(keypair->entry.peer); PACKET_CB(packets.next)->keypair = keypair; - wg_packet_create_data(packets.next); + wg_packet_create_data(peer, packets.next); return; out_invalid: From bce2473927af8de12ad131a743f55d69d358c0b9 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 22 Feb 2021 17:25:49 +0100 Subject: [PATCH 341/506] wireguard: kconfig: use arm chacha even with no neon The condition here was incorrect: a non-neon fallback implementation is available on arm32 when NEON is not supported. Reported-by: Ilya Lipnitskiy Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 1ebb4b943876..4a01ffa4350a 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -88,7 +88,7 @@ config WIREGUARD select CRYPTO_CURVE25519_X86 if X86 && 64BIT select ARM_CRYPTO if ARM select ARM64_CRYPTO if ARM64 - select CRYPTO_CHACHA20_NEON if (ARM || ARM64) && KERNEL_MODE_NEON + select CRYPTO_CHACHA20_NEON if ARM || (ARM64 && KERNEL_MODE_NEON) select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON select CRYPTO_POLY1305_ARM if ARM select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON From db07562aeac77923370bff4733d8b0e09cbc93c4 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Feb 2021 14:01:25 -0800 Subject: [PATCH 342/506] Makefile: reuse CC_VERSION_TEXT I noticed we're invoking $(CC) via $(shell) more than once to check the version. Let's reuse the first string captured in $CC_VERSION_TEXT. Signed-off-by: Nick Desaulniers [masahiro.yamada: CC_VERSION_TEXT is assigned by = instead of :=, so this $(shell ) is evaluated multiple times anyway. The number of $(CC) invocations will be still the same. Replacing 'grep' with the built-in $(findstring ) will give real performance benefit.] Signed-off-by: Masahiro Yamada --- Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 1fdd44fe1659..6c3a2064072d 100644 --- a/Makefile +++ b/Makefile @@ -557,7 +557,13 @@ ifdef building_out_of_srctree { echo "# this is build directory, ignore it"; echo "*"; } > .gitignore endif -ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) +# The expansion should be delayed until arch/$(SRCARCH)/Makefile is included. +# Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile. +# CC_VERSION_TEXT is referenced from Kconfig (so it needs export), +# and from include/config/auto.conf.cmd to detect the compiler upgrade. +CC_VERSION_TEXT = $(shell $(CC) --version 2>/dev/null | head -n 1) + +ifneq ($(findstring clang,$(CC_VERSION_TEXT)),) ifneq ($(CROSS_COMPILE),) CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit)) @@ -576,12 +582,6 @@ KBUILD_AFLAGS += $(CLANG_FLAGS) export CLANG_FLAGS endif -# The expansion should be delayed until arch/$(SRCARCH)/Makefile is included. -# Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile. -# CC_VERSION_TEXT is referenced from Kconfig (so it needs export), -# and from include/config/auto.conf.cmd to detect the compiler upgrade. -CC_VERSION_TEXT = $(shell $(CC) --version 2>/dev/null | head -n 1) - ifdef config-build # =========================================================================== # *config targets only - make sure prerequisites are updated, and descend From c75173a26948363bdd11a0d5b90bd012ce4cc2e7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 16 Feb 2021 14:33:12 -0700 Subject: [PATCH 343/506] Makefile: Remove # characters from compiler string When using AMD's Optimizing C/C++ Compiler (AOCC), the build fails due to a # character in the version string, which is interpreted as a comment: $ make CC=clang defconfig init/main.o include/config/auto.conf.cmd:1374: *** invalid syntax in conditional. Stop. $ sed -n 1374p include/config/auto.conf.cmd ifneq "$(CC_VERSION_TEXT)" "AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)" Remove all # characters in the version string so that the build does not fail unexpectedly. Link: https://github.com/ClangBuiltLinux/linux/issues/1298 Reported-by: Michael Fuckner Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6c3a2064072d..4dafeae67f54 100644 --- a/Makefile +++ b/Makefile @@ -561,7 +561,7 @@ endif # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile. # CC_VERSION_TEXT is referenced from Kconfig (so it needs export), # and from include/config/auto.conf.cmd to detect the compiler upgrade. -CC_VERSION_TEXT = $(shell $(CC) --version 2>/dev/null | head -n 1) +CC_VERSION_TEXT = $(shell $(CC) --version 2>/dev/null | head -n 1 | sed 's/\#//g') ifneq ($(findstring clang,$(CC_VERSION_TEXT)),) ifneq ($(CROSS_COMPILE),) From f82bd80d37ecc6ebda389473bd8414e89bbdbe05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 15 Feb 2021 19:15:09 +0100 Subject: [PATCH 344/506] kconfig: Remove duplicate call to sym_get_string_value() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the saved returned value of sym_get_string_value() instead of calling it twice. Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20210215181511.2840674-2-mic@digikod.net Signed-off-by: Masahiro Yamada --- scripts/kconfig/conf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index db03e2f45de4..18a233d27a8d 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -137,7 +137,7 @@ static int conf_string(struct menu *menu) printf("%*s%s ", indent - 1, "", menu->prompt->text); printf("(%s) ", sym->name); def = sym_get_string_value(sym); - if (sym_get_string_value(sym)) + if (def) printf("[%s] ", def); if (!conf_askvalue(sym, def)) return 0; From a4cff327d8533bde5bac147aaa8b09e8d835cab2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Feb 2021 18:26:22 +0900 Subject: [PATCH 345/506] kconfig: clean up nested if-conditionals in check_conf() Unify the outer two if-conditionals into one. This decreases the indent level by one. Also, change the if-else blocks: if (input_mode == listnewconfig) { ... } else if (input_mode == helpnewconfig) { ... } else { ... } into the switch statement. Signed-off-by: Masahiro Yamada --- scripts/kconfig/conf.c | 53 ++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 18a233d27a8d..369615d6c97e 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -419,34 +419,37 @@ static void check_conf(struct menu *menu) return; sym = menu->sym; - if (sym && !sym_has_value(sym)) { - if (sym_is_changeable(sym) || - (sym_is_choice(sym) && sym_get_tristate_value(sym) == yes)) { - if (input_mode == listnewconfig) { - if (sym->name) { - const char *str; + if (sym && !sym_has_value(sym) && + (sym_is_changeable(sym) || + (sym_is_choice(sym) && sym_get_tristate_value(sym) == yes))) { - if (sym->type == S_STRING) { - str = sym_get_string_value(sym); - str = sym_escape_string_value(str); - printf("%s%s=%s\n", CONFIG_, sym->name, str); - free((void *)str); - } else { - str = sym_get_string_value(sym); - printf("%s%s=%s\n", CONFIG_, sym->name, str); - } + switch (input_mode) { + case listnewconfig: + if (sym->name) { + const char *str; + + if (sym->type == S_STRING) { + str = sym_get_string_value(sym); + str = sym_escape_string_value(str); + printf("%s%s=%s\n", CONFIG_, sym->name, str); + free((void *)str); + } else { + str = sym_get_string_value(sym); + printf("%s%s=%s\n", CONFIG_, sym->name, str); } - } else if (input_mode == helpnewconfig) { - printf("-----\n"); - print_help(menu); - printf("-----\n"); - - } else { - if (!conf_cnt++) - printf("*\n* Restart config...\n*\n"); - rootEntry = menu_get_parent_menu(menu); - conf(rootEntry); } + break; + case helpnewconfig: + printf("-----\n"); + print_help(menu); + printf("-----\n"); + break; + default: + if (!conf_cnt++) + printf("*\n* Restart config...\n*\n"); + rootEntry = menu_get_parent_menu(menu); + conf(rootEntry); + break; } } From 102a1a72d0c80ffceae1e2a5d371699463c93733 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Feb 2021 18:26:23 +0900 Subject: [PATCH 346/506] kconfig: remove dead code in conf_askvalue() conf_askvalue() is only called for oldconfig, syncconfig, and oldaskconfig. If it is called for other cases, it is a bug. So, the code after the switch statement is unreachable. Remove the dead code, and clean up the switch statement. Signed-off-by: Masahiro Yamada --- scripts/kconfig/conf.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 369615d6c97e..3a98c9e0a7c8 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -84,8 +84,6 @@ static void xfgets(char *str, int size, FILE *in) static int conf_askvalue(struct symbol *sym, const char *def) { - enum symbol_type type = sym_get_type(sym); - if (!sym_has_value(sym)) printf("(NEW) "); @@ -107,24 +105,12 @@ static int conf_askvalue(struct symbol *sym, const char *def) return 0; } /* fall through */ - case oldaskconfig: + default: fflush(stdout); xfgets(line, sizeof(line), stdin); - return 1; - default: break; } - switch (type) { - case S_INT: - case S_HEX: - case S_STRING: - printf("%s\n", def); - return 1; - default: - ; - } - printf("%s", line); return 1; } From a2af62c3bd8fec5a2771be88c95783ddfcc57631 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Feb 2021 22:03:16 +0900 Subject: [PATCH 347/506] kconfig: fix 'invalid option' for help option scripts/kconfig/conf supports -? option to show the help message. This is not wired up to Makefile, so nobody would notice this, but it also shows 'invalid option' message. $ ./scripts/kconfig/conf -? ./scripts/kconfig/conf: invalid option -- '?' Usage: ./scripts/kconfig/conf [-s] [option] [option] is _one_ of the following: --listnewconfig List new options --helpnewconfig List new options and help text --oldaskconfig Start a new configuration using a line-oriented program ... The reason is the '?' is missing in the short option list passed to getopt_long(). While I fixed this issue, I also changed the option '?' to 'h'. I prefer -h (or --help, if a long option is also desired). Signed-off-by: Masahiro Yamada --- scripts/kconfig/conf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 3a98c9e0a7c8..37e17934b67a 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -494,7 +494,7 @@ int main(int ac, char **av) tty_stdio = isatty(0) && isatty(1); - while ((opt = getopt_long(ac, av, "s", long_opts, NULL)) != -1) { + while ((opt = getopt_long(ac, av, "hs", long_opts, NULL)) != -1) { if (opt == 's') { conf_set_message_callback(NULL); continue; @@ -550,7 +550,7 @@ int main(int ac, char **av) case yes2modconfig: case mod2yesconfig: break; - case '?': + case 'h': conf_usage(progname); exit(1); break; From ae8da72bde7a3fb5c756fa34506196fe190c3204 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Feb 2021 22:03:17 +0900 Subject: [PATCH 348/506] kconfig: omit --oldaskconfig option for 'make config' scripts/kconfig/conf.c line 39 defines the default of input_mode as oldaskconfig. Hence, 'make config' works in the same way even without the --oldaskconfig option given. Note this in the help message. This will be helpful to unify build rules in Makefile in the next commit. Signed-off-by: Masahiro Yamada --- scripts/kconfig/Makefile | 2 +- scripts/kconfig/conf.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 2c40e68853dd..5180a71c931f 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -29,7 +29,7 @@ menuconfig: $(obj)/mconf $(Q)$< $(silent) $(Kconfig) config: $(obj)/conf - $(Q)$< $(silent) --oldaskconfig $(Kconfig) + $(Q)$< $(silent) $(Kconfig) nconfig: $(obj)/nconf $(Q)$< $(silent) $(Kconfig) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 37e17934b67a..957d2a0832f7 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -483,6 +483,7 @@ static void conf_usage(const char *progname) printf(" --randconfig New config with random answer to all options\n"); printf(" --yes2modconfig Change answers from yes to mod if possible\n"); printf(" --mod2yesconfig Change answers from mod to yes if possible\n"); + printf(" (If none of the above is given, --oldaskconfig is the default)\n"); } int main(int ac, char **av) From f91e46b1a722082a5eabcd230d0dfcc6cff3c384 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Feb 2021 22:03:18 +0900 Subject: [PATCH 349/506] kconfig: unify rule of config, menuconfig, nconfig, gconfig, xconfig Unify the similar build rules. This supports 'make build_config', which builds scripts/kconfig/conf but does not invoke it. Signed-off-by: Masahiro Yamada --- scripts/kconfig/Makefile | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 5180a71c931f..8c19b82c6035 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -3,9 +3,6 @@ # Kernel configuration targets # These targets are used from top-level makefile -PHONY += xconfig gconfig menuconfig config localmodconfig localyesconfig \ - build_menuconfig build_nconfig build_gconfig build_xconfig - ifdef KBUILD_KCONFIG Kconfig := $(KBUILD_KCONFIG) else @@ -19,29 +16,24 @@ endif # We need this, in case the user has it in its environment unexport CONFIG_ -xconfig: $(obj)/qconf - $(Q)$< $(silent) $(Kconfig) +config-prog := conf +menuconfig-prog := mconf +nconfig-prog := nconf +gconfig-prog := gconf +xconfig-prog := qconf -gconfig: $(obj)/gconf - $(Q)$< $(silent) $(Kconfig) +define config_rule +PHONY += $(1) +$(1): $(obj)/$($(1)-prog) + $(Q)$$< $(silent) $(Kconfig) -menuconfig: $(obj)/mconf - $(Q)$< $(silent) $(Kconfig) +PHONY += build_$(1) +build_$(1): $(obj)/$($(1)-prog) +endef -config: $(obj)/conf - $(Q)$< $(silent) $(Kconfig) - -nconfig: $(obj)/nconf - $(Q)$< $(silent) $(Kconfig) - -build_menuconfig: $(obj)/mconf - -build_nconfig: $(obj)/nconf - -build_gconfig: $(obj)/gconf - -build_xconfig: $(obj)/qconf +$(foreach c, config menuconfig nconfig gconfig xconfig, $(eval $(call config_rule,$(c)))) +PHONY += localmodconfig localyesconfig localyesconfig localmodconfig: $(obj)/conf $(Q)$(PERL) $(srctree)/$(src)/streamline_config.pl --$@ $(srctree) $(Kconfig) > .tmp.config $(Q)if [ -f .config ]; then \ From 30cef68d2d19e48c5832b126d3f4a7aeae5a64d6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 22 Feb 2021 01:50:19 +0900 Subject: [PATCH 350/506] kbuild: reuse this-makefile to define abs_srctree Move this-makefile up, and reuse it to define abs_srctree. Signed-off-by: Masahiro Yamada --- Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 4dafeae67f54..667602a74a05 100644 --- a/Makefile +++ b/Makefile @@ -145,7 +145,8 @@ else need-sub-make := 1 endif -abs_srctree := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))) +this-makefile := $(lastword $(MAKEFILE_LIST)) +abs_srctree := $(realpath $(dir $(this-makefile))) ifneq ($(words $(subst :, ,$(abs_srctree))), 1) $(error source directory cannot contain spaces or colons) @@ -160,8 +161,6 @@ MAKEFLAGS += --include-dir=$(abs_srctree) need-sub-make := 1 endif -this-makefile := $(lastword $(MAKEFILE_LIST)) - ifneq ($(filter 3.%,$(MAKE_VERSION)),) # 'MAKEFLAGS += -rR' does not immediately become effective for GNU Make 3.x # We need to invoke sub-make to avoid implicit rules in the top Makefile. From bcf637f54f6d2515d4c9c81808faf01848916152 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 22 Feb 2021 01:53:06 +0900 Subject: [PATCH 351/506] kbuild: parse C= and M= before changing the working directory If Kbuild recurses to the top Makefile (for example, 'make deb-pkg'), C= and M= are parsed over again, needlessly. Parse them before changing the working directory. After that, sub_make_done is set to 1, so they are parsed just once. Signed-off-by: Masahiro Yamada --- Makefile | 61 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index 667602a74a05..c934feffc33a 100644 --- a/Makefile +++ b/Makefile @@ -100,6 +100,36 @@ endif export quiet Q KBUILD_VERBOSE +# Call a source code checker (by default, "sparse") as part of the +# C compilation. +# +# Use 'make C=1' to enable checking of only re-compiled files. +# Use 'make C=2' to enable checking of *all* source files, regardless +# of whether they are re-compiled or not. +# +# See the file "Documentation/dev-tools/sparse.rst" for more details, +# including where to get the "sparse" utility. + +ifeq ("$(origin C)", "command line") + KBUILD_CHECKSRC = $(C) +endif +ifndef KBUILD_CHECKSRC + KBUILD_CHECKSRC = 0 +endif + +export KBUILD_CHECKSRC + +# Use make M=dir or set the environment variable KBUILD_EXTMOD to specify the +# directory of external module to build. Setting M= takes precedence. +ifeq ("$(origin M)", "command line") + KBUILD_EXTMOD := $(M) +endif + +$(if $(word 2, $(KBUILD_EXTMOD)), \ + $(error building multiple external modules is not supported)) + +export KBUILD_EXTMOD + # Kbuild will save output files in the current working directory. # This does not need to match to the root of the kernel source tree. # @@ -194,36 +224,6 @@ ifeq ($(need-sub-make),) # so that IDEs/editors are able to understand relative filenames. MAKEFLAGS += --no-print-directory -# Call a source code checker (by default, "sparse") as part of the -# C compilation. -# -# Use 'make C=1' to enable checking of only re-compiled files. -# Use 'make C=2' to enable checking of *all* source files, regardless -# of whether they are re-compiled or not. -# -# See the file "Documentation/dev-tools/sparse.rst" for more details, -# including where to get the "sparse" utility. - -ifeq ("$(origin C)", "command line") - KBUILD_CHECKSRC = $(C) -endif -ifndef KBUILD_CHECKSRC - KBUILD_CHECKSRC = 0 -endif - -# Use make M=dir or set the environment variable KBUILD_EXTMOD to specify the -# directory of external module to build. Setting M= takes precedence. -ifeq ("$(origin M)", "command line") - KBUILD_EXTMOD := $(M) -endif - -$(if $(word 2, $(KBUILD_EXTMOD)), \ - $(error building multiple external modules is not supported)) - -export KBUILD_CHECKSRC KBUILD_EXTMOD - -extmod-prefix = $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD)/) - ifeq ($(abs_srctree),$(abs_objtree)) # building in the source tree srctree := . @@ -1093,6 +1093,7 @@ endif # CONFIG_BPF PHONY += prepare0 +extmod-prefix = $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD)/) export MODORDER := $(extmod-prefix)modules.order export MODULES_NSDEPS := $(extmod-prefix)modules.nsdeps From b97652bf10f1d3b0f1ca536377e92f99acfb2fcd Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 22 Feb 2021 01:53:56 +0900 Subject: [PATCH 352/506] kbuild: remove deprecated 'always' and 'hostprogs-y/m' These have no more user in the upstream code. The use of them has been warned for a while for external modules. The migration is finished. Signed-off-by: Masahiro Yamada --- scripts/Makefile.build | 1 - scripts/Makefile.clean | 3 --- scripts/Makefile.lib | 12 ------------ 3 files changed, 16 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 4c058f12dd73..3a07c46caa3e 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -15,7 +15,6 @@ obj-y := obj-m := lib-y := lib-m := -always := always-y := always-m := targets := diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean index d9e0ceace6a6..22a8172bce1f 100644 --- a/scripts/Makefile.clean +++ b/scripts/Makefile.clean @@ -34,9 +34,6 @@ __clean-files := \ $(hostprogs-always-y) $(hostprogs-always-m) $(hostprogs-always-) \ $(userprogs-always-y) $(userprogs-always-m) $(userprogs-always-) -# deprecated -__clean-files += $(always) $(hostprogs-y) $(hostprogs-m) $(hostprogs-) - __clean-files := $(filter-out $(no-clean-files), $(__clean-files)) # clean-files is given relative to the current directory, unless it diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 6f248ff91982..d5ec3864b318 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -4,18 +4,6 @@ asflags-y += $(EXTRA_AFLAGS) ccflags-y += $(EXTRA_CFLAGS) cppflags-y += $(EXTRA_CPPFLAGS) ldflags-y += $(EXTRA_LDFLAGS) -ifneq ($(always),) -$(warning 'always' is deprecated. Please use 'always-y' instead) -always-y += $(always) -endif -ifneq ($(hostprogs-y),) -$(warning 'hostprogs-y' is deprecated. Please use 'hostprogs' instead) -hostprogs += $(hostprogs-y) -endif -ifneq ($(hostprogs-m),) -$(warning 'hostprogs-m' is deprecated. Please use 'hostprogs' instead) -hostprogs += $(hostprogs-m) -endif # flags that take effect in current and sub directories KBUILD_AFLAGS += $(subdir-asflags-y) From 481083ec0bfc14c15f00fbe87c7b06dc01091950 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Feb 2021 13:54:28 +0100 Subject: [PATCH 353/506] initramfs: Remove redundant dependency of RD_ZSTD on BLK_DEV_INITRD Commit be1859bdc660 ("initramfs: remove redundant dependency on BLK_DEV_INITRD") removed all redundant dependencies on BLK_DEV_INITRD, but the recent addition of zstd support introduced a new one. Fixes: a30d8a39f057 ("usr: Add support for zstd compressed initramfs") Signed-off-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada --- usr/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/usr/Kconfig b/usr/Kconfig index 2599bc21c1b2..8bbcf699fe3b 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -103,7 +103,6 @@ config RD_LZ4 config RD_ZSTD bool "Support initial ramdisk/ramfs compressed using ZSTD" default y - depends on BLK_DEV_INITRD select DECOMPRESS_ZSTD help Support loading of a ZSTD encoded initial ramdisk or cpio buffer. From 610e4dc8ac463815f5180ae2e6fadae834891b86 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Mon, 22 Feb 2021 16:49:56 +0000 Subject: [PATCH 354/506] KVM: arm64: make the hyp vector table entries local Make the hyp vector table entries local functions so they are not accidentally referred to outside of this file. Using SYM_CODE_START_LOCAL matches the other vector tables (in hyp-stub.S, hibernate-asm.S and entry.S) Signed-off-by: Joey Gouly Acked-by: Will Deacon Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20210222164956.43514-1-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/kvm/hyp/hyp-entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index d179056e1af8..5f49df4ffdd8 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -119,7 +119,7 @@ el2_error: .macro invalid_vector label, target = __guest_exit_panic .align 2 -SYM_CODE_START(\label) +SYM_CODE_START_LOCAL(\label) b \target SYM_CODE_END(\label) .endm From f1b6cff7c98be2747d2fe16e42dcdcf2fc02c7e6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 24 Feb 2021 09:37:36 +0000 Subject: [PATCH 355/506] arm64: VHE: Enable EL2 MMU from the idmap Enabling the MMU requires the write to SCTLR_ELx (and the ISB that follows) to live in some identity-mapped memory. Otherwise, the translation will result in something totally unexpected (either fetching the wrong instruction stream, or taking a fault of some sort). This is exactly what happens in mutate_to_vhe(), as this code lives in the .hyp.text section, which isn't identity-mapped. With the right configuration, this explodes badly. Extract the MMU-enabling part of mutate_to_vhe(), and move it to its own function that lives in the idmap. This ensures nothing bad happens. Fixes: f359182291c7 ("arm64: Provide an 'upgrade to VHE' stub hypercall") Reported-by: "kernelci.org bot" Tested-by: Guillaume Tucker Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210224093738.3629662-2-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/hyp-stub.S | 39 ++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 678cd2c618ee..ae56787ea7c1 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -75,9 +75,6 @@ SYM_CODE_END(el1_sync) // nVHE? No way! Give me the real thing! SYM_CODE_START_LOCAL(mutate_to_vhe) - // Be prepared to fail - mov_q x0, HVC_STUB_ERR - // Sanity check: MMU *must* be off mrs x1, sctlr_el2 tbnz x1, #0, 1f @@ -96,8 +93,11 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) cmp x1, xzr and x2, x2, x1 csinv x2, x2, xzr, ne - cbz x2, 1f + cbnz x2, 2f +1: mov_q x0, HVC_STUB_ERR + eret +2: // Engage the VHE magic! mov_q x0, HCR_HOST_VHE_FLAGS msr hcr_el2, x0 @@ -131,6 +131,24 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) msr mair_el1, x0 isb + // Hack the exception return to stay at EL2 + mrs x0, spsr_el1 + and x0, x0, #~PSR_MODE_MASK + mov x1, #PSR_MODE_EL2h + orr x0, x0, x1 + msr spsr_el1, x0 + + b enter_vhe +SYM_CODE_END(mutate_to_vhe) + + // At the point where we reach enter_vhe(), we run with + // the MMU off (which is enforced by mutate_to_vhe()). + // We thus need to be in the idmap, or everything will + // explode when enabling the MMU. + + .pushsection .idmap.text, "ax" + +SYM_CODE_START_LOCAL(enter_vhe) // Invalidate TLBs before enabling the MMU tlbi vmalle1 dsb nsh @@ -143,17 +161,12 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr_s SYS_SCTLR_EL12, x0 - // Hack the exception return to stay at EL2 - mrs x0, spsr_el1 - and x0, x0, #~PSR_MODE_MASK - mov x1, #PSR_MODE_EL2h - orr x0, x0, x1 - msr spsr_el1, x0 - mov x0, xzr -1: eret -SYM_CODE_END(mutate_to_vhe) + eret +SYM_CODE_END(enter_vhe) + + .popsection .macro invalid_vector label SYM_CODE_START_LOCAL(\label) From 9d41053e8dc115c92b8002c3db5f545d7602498b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 24 Feb 2021 09:37:37 +0000 Subject: [PATCH 356/506] arm64: Add missing ISB after invalidating TLB in __primary_switch Although there has been a bit of back and forth on the subject, it appears that invalidating TLBs requires an ISB instruction when FEAT_ETS is not implemented by the CPU. From the bible: | In an implementation that does not implement FEAT_ETS, a TLB | maintenance instruction executed by a PE, PEx, can complete at any | time after it is issued, but is only guaranteed to be finished for a | PE, PEx, after the execution of DSB by the PEx followed by a Context | synchronization event Add the missing ISB in __primary_switch, just in case. Fixes: 3c5e9f238bc4 ("arm64: head.S: move KASLR processing out of __enable_mmu()") Suggested-by: Will Deacon Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210224093738.3629662-3-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 1e30b5550d2a..66b0e0b66e31 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -837,6 +837,7 @@ SYM_FUNC_START_LOCAL(__primary_switch) tlbi vmalle1 // Remove any stale TLB entries dsb nsh + isb set_sctlr_el1 x19 // re-enable the MMU From 430251cc864beb11ac5b6d2f5c6ef54ddd432612 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 24 Feb 2021 09:37:38 +0000 Subject: [PATCH 357/506] arm64: Add missing ISB after invalidating TLB in enter_vhe Although there has been a bit of back and forth on the subject, it appears that invalidating TLBs requires an ISB instruction after the TLBI/DSB sequence when FEAT_ETS is not implemented by the CPU. From the bible: | In an implementation that does not implement FEAT_ETS, a TLB | maintenance instruction executed by a PE, PEx, can complete at any | time after it is issued, but is only guaranteed to be finished for a | PE, PEx, after the execution of DSB by the PEx followed by a Context | synchronization event Add the missing ISB in enter_vhe(), just in case. Fixes: f359182291c7 ("arm64: Provide an 'upgrade to VHE' stub hypercall") Suggested-by: Will Deacon Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210224093738.3629662-4-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/hyp-stub.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index ae56787ea7c1..5eccbd62fec8 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -152,6 +152,7 @@ SYM_CODE_START_LOCAL(enter_vhe) // Invalidate TLBs before enabling the MMU tlbi vmalle1 dsb nsh + isb // Enable the EL2 S1 MMU, as set up from EL1 mrs_s x0, SYS_SCTLR_EL12 From 21f05a437e96d485180f33294757b14cfcf338d2 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Thu, 11 Feb 2021 21:17:00 +0100 Subject: [PATCH 358/506] ACPI: platform: Hide ACPI_PLATFORM_PROFILE option The ACPI_PLATFORM_PROFILE option essentially provides a library and not really an independent module. Thus it seems to be more user-friendly to hide this option and simply make drivers depending on it select it. Suggested-by: Hans de Goede Signed-off-by: Maximilian Luz Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 16 +--------------- drivers/platform/x86/Kconfig | 4 ++-- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index fc06945d3f99..7265173689ce 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -327,21 +327,7 @@ config ACPI_THERMAL the module will be called thermal. config ACPI_PLATFORM_PROFILE - tristate "ACPI Platform Profile Driver" - default m - help - This driver adds support for platform-profiles on platforms that - support it. - - Platform-profiles can be used to control the platform behaviour. For - example whether to operate in a lower power mode, in a higher - power performance mode or between the two. - - This driver provides the sysfs interface and is used as the registration - point for platform specific drivers. - - Which profiles are supported is determined on a per-platform basis and - should be obtained from the platform specific driver. + tristate config ACPI_CUSTOM_DSDT_FILE string "Custom DSDT Table file to include" diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 56353e8c792a..ad4e630e73e2 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -450,7 +450,7 @@ config IDEAPAD_LAPTOP depends on BACKLIGHT_CLASS_DEVICE depends on ACPI_VIDEO || ACPI_VIDEO = n depends on ACPI_WMI || ACPI_WMI = n - depends on ACPI_PLATFORM_PROFILE + select ACPI_PLATFORM_PROFILE select INPUT_SPARSEKMAP select NEW_LEDS select LEDS_CLASS @@ -484,7 +484,7 @@ config THINKPAD_ACPI depends on RFKILL || RFKILL = n depends on ACPI_VIDEO || ACPI_VIDEO = n depends on BACKLIGHT_CLASS_DEVICE - depends on ACPI_PLATFORM_PROFILE + select ACPI_PLATFORM_PROFILE select HWMON select NVRAM select NEW_LEDS From 6120484ef2bd4ffea7d2f11d2f06167b8f848349 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Thu, 11 Feb 2021 21:17:01 +0100 Subject: [PATCH 359/506] ACPI: platform: Fix file references in comment The referenced files are named slightly different. Replace '-' with '_' and drop the .rst ending. Signed-off-by: Maximilian Luz Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- include/linux/platform_profile.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index a26542d53058..b4794027e759 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -12,9 +12,8 @@ #include /* - * If more options are added please update profile_names - * array in platform-profile.c and sysfs-platform-profile.rst - * documentation. + * If more options are added please update profile_names array in + * platform_profile.c and sysfs-platform_profile documentation. */ enum platform_profile_option { From 6c0b5e3fc6b536b125a66dfee103f3bc26d386f6 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Thu, 11 Feb 2021 21:17:02 +0100 Subject: [PATCH 360/506] ACPI: platform: Add balanced-performance platform profile Some devices, including most Microsoft Surface devices, have a platform profile somewhere inbetween balanced and performance. More specifically, adding this profile allows the following mapping on Surface devices: Vendor Name Platform Profile ------------------------------------------ Battery Saver low-power Recommended balanced Better Performance balanced-performance Best Performance performance Suggested-by: Hans de Goede Signed-off-by: Maximilian Luz Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- .../ABI/testing/sysfs-platform_profile | 18 +++++++++++------- drivers/acpi/platform_profile.c | 1 + include/linux/platform_profile.h | 1 + 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-platform_profile b/Documentation/ABI/testing/sysfs-platform_profile index 9d6b89b66cca..dae9c8941905 100644 --- a/Documentation/ABI/testing/sysfs-platform_profile +++ b/Documentation/ABI/testing/sysfs-platform_profile @@ -5,13 +5,17 @@ Description: This file contains a space-separated list of profiles supported for Drivers must use the following standard profile-names: - ============ ============================================ - low-power Low power consumption - cool Cooler operation - quiet Quieter operation - balanced Balance between low power consumption and performance - performance High performance operation - ============ ============================================ + ==================== ======================================== + low-power Low power consumption + cool Cooler operation + quiet Quieter operation + balanced Balance between low power consumption + and performance + balanced-performance Balance between performance and low + power consumption with a slight bias + towards performance + performance High performance operation + ==================== ======================================== Userspace may expect drivers to offer more than one of these standard profile names. diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c index 4a59c5993bde..dd2fbf38e414 100644 --- a/drivers/acpi/platform_profile.c +++ b/drivers/acpi/platform_profile.c @@ -17,6 +17,7 @@ static const char * const profile_names[] = { [PLATFORM_PROFILE_COOL] = "cool", [PLATFORM_PROFILE_QUIET] = "quiet", [PLATFORM_PROFILE_BALANCED] = "balanced", + [PLATFORM_PROFILE_BALANCED_PERFORMANCE] = "balanced-performance", [PLATFORM_PROFILE_PERFORMANCE] = "performance", }; static_assert(ARRAY_SIZE(profile_names) == PLATFORM_PROFILE_LAST); diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index b4794027e759..a6329003aee7 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -21,6 +21,7 @@ enum platform_profile_option { PLATFORM_PROFILE_COOL, PLATFORM_PROFILE_QUIET, PLATFORM_PROFILE_BALANCED, + PLATFORM_PROFILE_BALANCED_PERFORMANCE, PLATFORM_PROFILE_PERFORMANCE, PLATFORM_PROFILE_LAST, /*must always be last */ }; From b092b19602cfd47de1eeeb3a1b03822afd86b136 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Mon, 22 Feb 2021 14:04:12 +0800 Subject: [PATCH 361/506] drm/amdgpu: fix shutdown and poweroff process failed with s0ix In the shutdown and poweroff opt on the s0i3 system we still need un-gate the gfx clock gating and power gating before destory amdgpu device. Fixes: 628c36d7b238e2 ("drm/amdgpu: update amdgpu device suspend/resume sequence for s0i3 support") Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1499 Signed-off-by: Alex Deucher Signed-off-by: Prike Liang Reviewed-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 ++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 86452f4f3c82..b6879d97c9c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1008,6 +1008,12 @@ struct amdgpu_device { bool in_suspend; bool in_hibernate; + /* + * The combination flag in_poweroff_reboot_com used to identify the poweroff + * and reboot opt in the s0i3 system-wide suspend. + */ + bool in_poweroff_reboot_com; + atomic_t in_gpu_reset; enum pp_mp1_state mp1_state; struct rw_semaphore reset_sem; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7052dc35d278..6447cd6ca5a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2678,7 +2678,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; - if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { + if (adev->in_poweroff_reboot_com || + !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); } @@ -3741,7 +3742,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) amdgpu_fence_driver_suspend(adev); - if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) + if (adev->in_poweroff_reboot_com || + !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) r = amdgpu_device_ip_suspend_phase2(adev); else amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 03c529630c21..4575192d9b08 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1270,7 +1270,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) */ if (!amdgpu_passthrough(adev)) adev->mp1_state = PP_MP1_STATE_UNLOAD; + adev->in_poweroff_reboot_com = true; amdgpu_device_ip_suspend(adev); + adev->in_poweroff_reboot_com = false; adev->mp1_state = PP_MP1_STATE_NONE; } @@ -1312,8 +1314,13 @@ static int amdgpu_pmops_thaw(struct device *dev) static int amdgpu_pmops_poweroff(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(drm_dev); + int r; - return amdgpu_device_suspend(drm_dev, true); + adev->in_poweroff_reboot_com = true; + r = amdgpu_device_suspend(drm_dev, true); + adev->in_poweroff_reboot_com = false; + return r; } static int amdgpu_pmops_restore(struct device *dev) From ea3b4242bc9ca197762119382b37e125815bd67f Mon Sep 17 00:00:00 2001 From: Qingqing Zhuo Date: Tue, 9 Feb 2021 16:36:41 -0500 Subject: [PATCH 362/506] drm/amd/display: Fix system hang after multiple hotplugs (v3) [Why] mutex_lock() was introduced in dm_disable_vblank(), which could be called in an IRQ context. Waiting in IRQ would cause issues like kernel lockup, etc. [How] Handle code that requires mutex lock on a different thread. v2: squash in compilation fix without CONFIG_DRM_AMD_DC_DCN (Alex) v3: squash in warning fix (Wei) Signed-off-by: Qingqing Zhuo Acked-by: Bindu Ramamurthy Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 78 +++++++++++++++---- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 27 +++++++ 2 files changed, 92 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 344404c4ac75..3e1fd1e7d09f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -937,7 +937,49 @@ static void mmhub_read_system_context(struct amdgpu_device *adev, struct dc_phy_ } #endif +#if defined(CONFIG_DRM_AMD_DC_DCN) +static void event_mall_stutter(struct work_struct *work) +{ + struct vblank_workqueue *vblank_work = container_of(work, struct vblank_workqueue, mall_work); + struct amdgpu_display_manager *dm = vblank_work->dm; + + mutex_lock(&dm->dc_lock); + + if (vblank_work->enable) + dm->active_vblank_irq_count++; + else + dm->active_vblank_irq_count--; + + + dc_allow_idle_optimizations( + dm->dc, dm->active_vblank_irq_count == 0 ? true : false); + + DRM_DEBUG_DRIVER("Allow idle optimizations (MALL): %d\n", dm->active_vblank_irq_count == 0); + + + mutex_unlock(&dm->dc_lock); +} + +static struct vblank_workqueue *vblank_create_workqueue(struct amdgpu_device *adev, struct dc *dc) +{ + + int max_caps = dc->caps.max_links; + struct vblank_workqueue *vblank_work; + int i = 0; + + vblank_work = kcalloc(max_caps, sizeof(*vblank_work), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(vblank_work)) { + kfree(vblank_work); + return NULL; + } + + for (i = 0; i < max_caps; i++) + INIT_WORK(&vblank_work[i].mall_work, event_mall_stutter); + + return vblank_work; +} +#endif static int amdgpu_dm_init(struct amdgpu_device *adev) { struct dc_init_data init_data; @@ -957,6 +999,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) mutex_init(&adev->dm.dc_lock); mutex_init(&adev->dm.audio_lock); +#if defined(CONFIG_DRM_AMD_DC_DCN) + spin_lock_init(&adev->dm.vblank_lock); +#endif if(amdgpu_dm_irq_init(adev)) { DRM_ERROR("amdgpu: failed to initialize DM IRQ support.\n"); @@ -1071,6 +1116,17 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) amdgpu_dm_init_color_mod(); +#if defined(CONFIG_DRM_AMD_DC_DCN) + if (adev->dm.dc->caps.max_links > 0) { + adev->dm.vblank_workqueue = vblank_create_workqueue(adev, adev->dm.dc); + + if (!adev->dm.vblank_workqueue) + DRM_ERROR("amdgpu: failed to initialize vblank_workqueue.\n"); + else + DRM_DEBUG_DRIVER("amdgpu: vblank_workqueue init done %p.\n", adev->dm.vblank_workqueue); + } +#endif + #ifdef CONFIG_DRM_AMD_DC_HDCP if (adev->dm.dc->caps.max_links > 0 && adev->asic_type >= CHIP_RAVEN) { adev->dm.hdcp_workqueue = hdcp_create_workqueue(adev, &init_params.cp_psp, adev->dm.dc); @@ -5375,7 +5431,10 @@ static inline int dm_set_vblank(struct drm_crtc *crtc, bool enable) struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc); struct amdgpu_device *adev = drm_to_adev(crtc->dev); struct dm_crtc_state *acrtc_state = to_dm_crtc_state(crtc->state); +#if defined(CONFIG_DRM_AMD_DC_DCN) struct amdgpu_display_manager *dm = &adev->dm; + unsigned long flags; +#endif int rc = 0; if (enable) { @@ -5398,22 +5457,15 @@ static inline int dm_set_vblank(struct drm_crtc *crtc, bool enable) if (amdgpu_in_reset(adev)) return 0; - mutex_lock(&dm->dc_lock); - - if (enable) - dm->active_vblank_irq_count++; - else - dm->active_vblank_irq_count--; - #if defined(CONFIG_DRM_AMD_DC_DCN) - dc_allow_idle_optimizations( - adev->dm.dc, dm->active_vblank_irq_count == 0 ? true : false); - - DRM_DEBUG_DRIVER("Allow idle optimizations (MALL): %d\n", dm->active_vblank_irq_count == 0); + spin_lock_irqsave(&dm->vblank_lock, flags); + dm->vblank_workqueue->dm = dm; + dm->vblank_workqueue->otg_inst = acrtc->otg_inst; + dm->vblank_workqueue->enable = enable; + spin_unlock_irqrestore(&dm->vblank_lock, flags); + schedule_work(&dm->vblank_workqueue->mall_work); #endif - mutex_unlock(&dm->dc_lock); - return 0; } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index f72930c36c22..8bfe901cf237 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -92,6 +92,20 @@ struct dm_compressor_info { uint64_t gpu_addr; }; +/** + * struct vblank_workqueue - Works to be executed in a separate thread during vblank + * @mall_work: work for mall stutter + * @dm: amdgpu display manager device + * @otg_inst: otg instance of which vblank is being set + * @enable: true if enable vblank + */ +struct vblank_workqueue { + struct work_struct mall_work; + struct amdgpu_display_manager *dm; + int otg_inst; + bool enable; +}; + /** * struct amdgpu_dm_backlight_caps - Information about backlight * @@ -243,6 +257,15 @@ struct amdgpu_display_manager { */ struct mutex audio_lock; + /** + * @vblank_work_lock: + * + * Guards access to deferred vblank work state. + */ +#if defined(CONFIG_DRM_AMD_DC_DCN) + spinlock_t vblank_lock; +#endif + /** * @audio_component: * @@ -321,6 +344,10 @@ struct amdgpu_display_manager { struct hdcp_workqueue *hdcp_workqueue; #endif +#if defined(CONFIG_DRM_AMD_DC_DCN) + struct vblank_workqueue *vblank_workqueue; +#endif + struct drm_atomic_state *cached_state; struct dc_state *cached_dc_state; From cc84a8e65d87990ebb09f65a745b38e95b4d2721 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 19 Jan 2021 20:45:06 +0000 Subject: [PATCH 363/506] dma-buf: system_heap: Make sure to return an error if we abort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we abort from the allocation due to a fatal_signal_pending(), be sure we report an error so any return code paths don't trip over the fact that the allocation didn't succeed. Cc: Sumit Semwal Cc: Liam Mark Cc: Laura Abbott Cc: Brian Starkey Cc: Hridya Valsaraju Cc: Suren Baghdasaryan Cc: Sandeep Patil Cc: Daniel Mentz Cc: Chris Goldsworthy Cc: Ørjan Eide Cc: Robin Murphy Cc: Ezequiel Garcia Cc: Simon Ser Cc: James Jones Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Suggested-by: Suren Baghdasaryan Signed-off-by: John Stultz Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20210119204508.9256-1-john.stultz@linaro.org (cherry picked from commit 14a117252f57839bdf0123a1c888a96102e3a843) Signed-off-by: Sumit Semwal --- drivers/dma-buf/heaps/system_heap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c index 17e0e9a68baf..405351aad2a8 100644 --- a/drivers/dma-buf/heaps/system_heap.c +++ b/drivers/dma-buf/heaps/system_heap.c @@ -363,8 +363,10 @@ static int system_heap_allocate(struct dma_heap *heap, * Avoid trying to allocate memory if the process * has been killed by SIGKILL */ - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current)) { + ret = -EINTR; goto free_buffer; + } page = alloc_largest_available(size_remaining, max_order); if (!page) From abf4451b340b09f797c87341b3010f95af9215c0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 19 Jan 2021 20:45:08 +0000 Subject: [PATCH 364/506] dma-buf: heaps: Rework heap allocation hooks to return struct dma_buf instead of fd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every heap needs to create a dmabuf and then export it to a fd via dma_buf_fd(), so to consolidate things a bit, have the heaps just return a struct dmabuf * and let the top level dma_heap_buffer_alloc() call handle creating the fd via dma_buf_fd(). Cc: Sumit Semwal Cc: Liam Mark Cc: Laura Abbott Cc: Brian Starkey Cc: Hridya Valsaraju Cc: Suren Baghdasaryan Cc: Sandeep Patil Cc: Daniel Mentz Cc: Chris Goldsworthy Cc: Ørjan Eide Cc: Robin Murphy Cc: Ezequiel Garcia Cc: Simon Ser Cc: James Jones Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Signed-off-by: John Stultz Signed-off-by: Sumit Semwal [sumits: minor reword of commit message] Link: https://patchwork.freedesktop.org/patch/msgid/20210119204508.9256-3-john.stultz@linaro.org (cherry picked from commit c7f59e3dd60313071a989227dcb69094f499d310) Signed-off-by: Sumit Semwal --- drivers/dma-buf/dma-heap.c | 14 +++++++++++++- drivers/dma-buf/heaps/cma_heap.c | 22 +++++++--------------- drivers/dma-buf/heaps/system_heap.c | 21 +++++++-------------- include/linux/dma-heap.h | 12 ++++++------ 4 files changed, 33 insertions(+), 36 deletions(-) diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c index afd22c9dbdcf..6b5db954569f 100644 --- a/drivers/dma-buf/dma-heap.c +++ b/drivers/dma-buf/dma-heap.c @@ -52,6 +52,9 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len, unsigned int fd_flags, unsigned int heap_flags) { + struct dma_buf *dmabuf; + int fd; + /* * Allocations from all heaps have to begin * and end on page boundaries. @@ -60,7 +63,16 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len, if (!len) return -EINVAL; - return heap->ops->allocate(heap, len, fd_flags, heap_flags); + dmabuf = heap->ops->allocate(heap, len, fd_flags, heap_flags); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + fd = dma_buf_fd(dmabuf, fd_flags); + if (fd < 0) { + dma_buf_put(dmabuf); + /* just return, as put will call release and that will free */ + } + return fd; } static int dma_heap_open(struct inode *inode, struct file *file) diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c index 364fc2f3e499..5d64eccd21d6 100644 --- a/drivers/dma-buf/heaps/cma_heap.c +++ b/drivers/dma-buf/heaps/cma_heap.c @@ -271,10 +271,10 @@ static const struct dma_buf_ops cma_heap_buf_ops = { .release = cma_heap_dma_buf_release, }; -static int cma_heap_allocate(struct dma_heap *heap, - unsigned long len, - unsigned long fd_flags, - unsigned long heap_flags) +static struct dma_buf *cma_heap_allocate(struct dma_heap *heap, + unsigned long len, + unsigned long fd_flags, + unsigned long heap_flags) { struct cma_heap *cma_heap = dma_heap_get_drvdata(heap); struct cma_heap_buffer *buffer; @@ -289,7 +289,7 @@ static int cma_heap_allocate(struct dma_heap *heap, buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&buffer->attachments); mutex_init(&buffer->lock); @@ -348,15 +348,7 @@ static int cma_heap_allocate(struct dma_heap *heap, ret = PTR_ERR(dmabuf); goto free_pages; } - - ret = dma_buf_fd(dmabuf, fd_flags); - if (ret < 0) { - dma_buf_put(dmabuf); - /* just return, as put will call release and that will free */ - return ret; - } - - return ret; + return dmabuf; free_pages: kfree(buffer->pages); @@ -365,7 +357,7 @@ static int cma_heap_allocate(struct dma_heap *heap, free_buffer: kfree(buffer); - return ret; + return ERR_PTR(ret); } static const struct dma_heap_ops cma_heap_ops = { diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c index 405351aad2a8..29e49ac17251 100644 --- a/drivers/dma-buf/heaps/system_heap.c +++ b/drivers/dma-buf/heaps/system_heap.c @@ -331,10 +331,10 @@ static struct page *alloc_largest_available(unsigned long size, return NULL; } -static int system_heap_allocate(struct dma_heap *heap, - unsigned long len, - unsigned long fd_flags, - unsigned long heap_flags) +static struct dma_buf *system_heap_allocate(struct dma_heap *heap, + unsigned long len, + unsigned long fd_flags, + unsigned long heap_flags) { struct system_heap_buffer *buffer; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); @@ -349,7 +349,7 @@ static int system_heap_allocate(struct dma_heap *heap, buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&buffer->attachments); mutex_init(&buffer->lock); @@ -399,14 +399,7 @@ static int system_heap_allocate(struct dma_heap *heap, ret = PTR_ERR(dmabuf); goto free_pages; } - - ret = dma_buf_fd(dmabuf, fd_flags); - if (ret < 0) { - dma_buf_put(dmabuf); - /* just return, as put will call release and that will free */ - return ret; - } - return ret; + return dmabuf; free_pages: for_each_sgtable_sg(table, sg, i) { @@ -420,7 +413,7 @@ static int system_heap_allocate(struct dma_heap *heap, __free_pages(page, compound_order(page)); kfree(buffer); - return ret; + return ERR_PTR(ret); } static const struct dma_heap_ops system_heap_ops = { diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h index 454e354d1ffb..5bc5c946af58 100644 --- a/include/linux/dma-heap.h +++ b/include/linux/dma-heap.h @@ -16,15 +16,15 @@ struct dma_heap; /** * struct dma_heap_ops - ops to operate on a given heap - * @allocate: allocate dmabuf and return fd + * @allocate: allocate dmabuf and return struct dma_buf ptr * - * allocate returns dmabuf fd on success, -errno on error. + * allocate returns dmabuf on success, ERR_PTR(-errno) on error. */ struct dma_heap_ops { - int (*allocate)(struct dma_heap *heap, - unsigned long len, - unsigned long fd_flags, - unsigned long heap_flags); + struct dma_buf *(*allocate)(struct dma_heap *heap, + unsigned long len, + unsigned long fd_flags, + unsigned long heap_flags); }; /** From f588f0c69e0e645225e4ebc1aff8f9677583a056 Mon Sep 17 00:00:00 2001 From: Veera Sundaram Sankaran Date: Fri, 15 Jan 2021 16:31:46 -0800 Subject: [PATCH 365/506] dma-fence: allow signaling drivers to set fence timestamp Some drivers have hardware capability to get the precise HW timestamp of certain events based on which the fences are triggered. The delta between the event HW timestamp & current HW reference timestamp can be used to calculate the timestamp in kernel's CLOCK_MONOTONIC time domain. This allows it to set accurate timestamp factoring out any software and IRQ latencies. Add a timestamp variant of fence signal function, dma_fence_signal_timestamp to allow drivers to update the precise timestamp for fences. Changes in v2: - Add a new fence signal variant instead of modifying fence struct Changes in v3: - Add timestamp domain information to commit-text and dma_fence_signal_timestamp documentation Signed-off-by: Veera Sundaram Sankaran Reviewed-by: John Stultz Signed-off-by: Sumit Semwal [sumits: minor parenthesis alignment] Link: https://patchwork.freedesktop.org/patch/msgid/1610757107-11892-1-git-send-email-veeras@codeaurora.org (cherry picked from commit 5a164ac4dbd21b82bcdc03186d40e455ff467fdc) Signed-off-by: Sumit Semwal --- drivers/dma-buf/dma-fence.c | 102 ++++++++++++++++++++++++++++-------- include/linux/dma-fence.h | 3 ++ 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 7475e09b0680..d64fc03929be 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -311,6 +311,83 @@ void __dma_fence_might_wait(void) #endif +/** + * dma_fence_signal_timestamp_locked - signal completion of a fence + * @fence: the fence to signal + * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain + * + * Signal completion for software callbacks on a fence, this will unblock + * dma_fence_wait() calls and run all the callbacks added with + * dma_fence_add_callback(). Can be called multiple times, but since a fence + * can only go from the unsignaled to the signaled state and not back, it will + * only be effective the first time. Set the timestamp provided as the fence + * signal timestamp. + * + * Unlike dma_fence_signal_timestamp(), this function must be called with + * &dma_fence.lock held. + * + * Returns 0 on success and a negative error value when @fence has been + * signalled already. + */ +int dma_fence_signal_timestamp_locked(struct dma_fence *fence, + ktime_t timestamp) +{ + struct dma_fence_cb *cur, *tmp; + struct list_head cb_list; + + lockdep_assert_held(fence->lock); + + if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + &fence->flags))) + return -EINVAL; + + /* Stash the cb_list before replacing it with the timestamp */ + list_replace(&fence->cb_list, &cb_list); + + fence->timestamp = timestamp; + set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags); + trace_dma_fence_signaled(fence); + + list_for_each_entry_safe(cur, tmp, &cb_list, node) { + INIT_LIST_HEAD(&cur->node); + cur->func(fence, cur); + } + + return 0; +} +EXPORT_SYMBOL(dma_fence_signal_timestamp_locked); + +/** + * dma_fence_signal_timestamp - signal completion of a fence + * @fence: the fence to signal + * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain + * + * Signal completion for software callbacks on a fence, this will unblock + * dma_fence_wait() calls and run all the callbacks added with + * dma_fence_add_callback(). Can be called multiple times, but since a fence + * can only go from the unsignaled to the signaled state and not back, it will + * only be effective the first time. Set the timestamp provided as the fence + * signal timestamp. + * + * Returns 0 on success and a negative error value when @fence has been + * signalled already. + */ +int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp) +{ + unsigned long flags; + int ret; + + if (!fence) + return -EINVAL; + + spin_lock_irqsave(fence->lock, flags); + ret = dma_fence_signal_timestamp_locked(fence, timestamp); + spin_unlock_irqrestore(fence->lock, flags); + + return ret; +} +EXPORT_SYMBOL(dma_fence_signal_timestamp); + /** * dma_fence_signal_locked - signal completion of a fence * @fence: the fence to signal @@ -329,28 +406,7 @@ void __dma_fence_might_wait(void) */ int dma_fence_signal_locked(struct dma_fence *fence) { - struct dma_fence_cb *cur, *tmp; - struct list_head cb_list; - - lockdep_assert_held(fence->lock); - - if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, - &fence->flags))) - return -EINVAL; - - /* Stash the cb_list before replacing it with the timestamp */ - list_replace(&fence->cb_list, &cb_list); - - fence->timestamp = ktime_get(); - set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags); - trace_dma_fence_signaled(fence); - - list_for_each_entry_safe(cur, tmp, &cb_list, node) { - INIT_LIST_HEAD(&cur->node); - cur->func(fence, cur); - } - - return 0; + return dma_fence_signal_timestamp_locked(fence, ktime_get()); } EXPORT_SYMBOL(dma_fence_signal_locked); @@ -379,7 +435,7 @@ int dma_fence_signal(struct dma_fence *fence) tmp = dma_fence_begin_signalling(); spin_lock_irqsave(fence->lock, flags); - ret = dma_fence_signal_locked(fence); + ret = dma_fence_signal_timestamp_locked(fence, ktime_get()); spin_unlock_irqrestore(fence->lock, flags); dma_fence_end_signalling(tmp); diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 09e23adb351d..9f12efaaa93a 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -372,6 +372,9 @@ static inline void __dma_fence_might_wait(void) {} int dma_fence_signal(struct dma_fence *fence); int dma_fence_signal_locked(struct dma_fence *fence); +int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp); +int dma_fence_signal_timestamp_locked(struct dma_fence *fence, + ktime_t timestamp); signed long dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout); int dma_fence_add_callback(struct dma_fence *fence, From 583065c7aa23d4bb0c298222c1128353a2007c9c Mon Sep 17 00:00:00 2001 From: Veera Sundaram Sankaran Date: Fri, 15 Jan 2021 16:31:47 -0800 Subject: [PATCH 366/506] drm/drm_vblank: set the dma-fence timestamp during send_vblank_event The explicit out-fences in crtc are signaled as part of vblank event, indicating all framebuffers present on the Atomic Commit request are scanned out on the screen. Though the fence signal and the vblank event notification happens at the same time, triggered by the same hardware vsync event, the timestamp set in both are different. With drivers supporting precise vblank timestamp the difference between the two timestamps would be even higher. This might have an impact on use-mode frameworks using these fence timestamps for purposes other than simple buffer usage. For instance, the Android framework [1] uses the retire-fences as an alternative to vblank when frame-updates are in progress. Set the fence timestamp during send vblank event using a new drm_send_event_timestamp_locked variant to avoid discrepancies. [1] https://android.googlesource.com/platform/frameworks/native/+/master/ services/surfaceflinger/Scheduler/Scheduler.cpp#397 Changes in v2: - Use drm_send_event_timestamp_locked to update fence timestamp - add more information to commit text Changes in v3: - use same backend helper function for variants of drm_send_event to avoid code duplications Changes in v4: - remove WARN_ON from drm_send_event_timestamp_locked Signed-off-by: Veera Sundaram Sankaran Reviewed-by: John Stultz Signed-off-by: Sumit Semwal [sumits: minor parenthesis alignment correction] Link: https://patchwork.freedesktop.org/patch/msgid/1610757107-11892-2-git-send-email-veeras@codeaurora.org (cherry picked from commit a78e7a51d2fa9d2f482b462be4299784c884d988) Signed-off-by: Sumit Semwal --- drivers/gpu/drm/drm_file.c | 92 ++++++++++++++++++++++++++---------- drivers/gpu/drm/drm_vblank.c | 9 +++- include/drm/drm_file.h | 3 ++ 3 files changed, 79 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index 6b116bfd747c..7efbccffc2ea 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -774,6 +774,72 @@ void drm_event_cancel_free(struct drm_device *dev, } EXPORT_SYMBOL(drm_event_cancel_free); +/** + * drm_send_event_helper - send DRM event to file descriptor + * @dev: DRM device + * @e: DRM event to deliver + * @timestamp: timestamp to set for the fence event in kernel's CLOCK_MONOTONIC + * time domain + * + * This helper function sends the event @e, initialized with + * drm_event_reserve_init(), to its associated userspace DRM file. + * The timestamp variant of dma_fence_signal is used when the caller + * sends a valid timestamp. + */ +void drm_send_event_helper(struct drm_device *dev, + struct drm_pending_event *e, ktime_t timestamp) +{ + assert_spin_locked(&dev->event_lock); + + if (e->completion) { + complete_all(e->completion); + e->completion_release(e->completion); + e->completion = NULL; + } + + if (e->fence) { + if (timestamp) + dma_fence_signal_timestamp(e->fence, timestamp); + else + dma_fence_signal(e->fence); + dma_fence_put(e->fence); + } + + if (!e->file_priv) { + kfree(e); + return; + } + + list_del(&e->pending_link); + list_add_tail(&e->link, + &e->file_priv->event_list); + wake_up_interruptible_poll(&e->file_priv->event_wait, + EPOLLIN | EPOLLRDNORM); +} + +/** + * drm_send_event_timestamp_locked - send DRM event to file descriptor + * @dev: DRM device + * @e: DRM event to deliver + * @timestamp: timestamp to set for the fence event in kernel's CLOCK_MONOTONIC + * time domain + * + * This function sends the event @e, initialized with drm_event_reserve_init(), + * to its associated userspace DRM file. Callers must already hold + * &drm_device.event_lock. + * + * Note that the core will take care of unlinking and disarming events when the + * corresponding DRM file is closed. Drivers need not worry about whether the + * DRM file for this event still exists and can call this function upon + * completion of the asynchronous work unconditionally. + */ +void drm_send_event_timestamp_locked(struct drm_device *dev, + struct drm_pending_event *e, ktime_t timestamp) +{ + drm_send_event_helper(dev, e, timestamp); +} +EXPORT_SYMBOL(drm_send_event_timestamp_locked); + /** * drm_send_event_locked - send DRM event to file descriptor * @dev: DRM device @@ -790,29 +856,7 @@ EXPORT_SYMBOL(drm_event_cancel_free); */ void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e) { - assert_spin_locked(&dev->event_lock); - - if (e->completion) { - complete_all(e->completion); - e->completion_release(e->completion); - e->completion = NULL; - } - - if (e->fence) { - dma_fence_signal(e->fence); - dma_fence_put(e->fence); - } - - if (!e->file_priv) { - kfree(e); - return; - } - - list_del(&e->pending_link); - list_add_tail(&e->link, - &e->file_priv->event_list); - wake_up_interruptible_poll(&e->file_priv->event_wait, - EPOLLIN | EPOLLRDNORM); + drm_send_event_helper(dev, e, 0); } EXPORT_SYMBOL(drm_send_event_locked); @@ -836,7 +880,7 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e) unsigned long irqflags; spin_lock_irqsave(&dev->event_lock, irqflags); - drm_send_event_locked(dev, e); + drm_send_event_helper(dev, e, 0); spin_unlock_irqrestore(&dev->event_lock, irqflags); } EXPORT_SYMBOL(drm_send_event); diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c index 30912d8f82a5..893165eeddf3 100644 --- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -1006,7 +1006,14 @@ static void send_vblank_event(struct drm_device *dev, break; } trace_drm_vblank_event_delivered(e->base.file_priv, e->pipe, seq); - drm_send_event_locked(dev, &e->base); + /* + * Use the same timestamp for any associated fence signal to avoid + * mismatch in timestamps for vsync & fence events triggered by the + * same HW event. Frameworks like SurfaceFlinger in Android expects the + * retire-fence timestamp to match exactly with HW vsync as it uses it + * for its software vsync modeling. + */ + drm_send_event_timestamp_locked(dev, &e->base, now); } /** diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h index 716990bace10..b81b3bfb08c8 100644 --- a/include/drm/drm_file.h +++ b/include/drm/drm_file.h @@ -399,6 +399,9 @@ void drm_event_cancel_free(struct drm_device *dev, struct drm_pending_event *p); void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e); void drm_send_event(struct drm_device *dev, struct drm_pending_event *e); +void drm_send_event_timestamp_locked(struct drm_device *dev, + struct drm_pending_event *e, + ktime_t timestamp); struct file *mock_drm_getfile(struct drm_minor *minor, unsigned int flags); From 88eee9b7b42e69fb622ddb3ff6f37e8e4347f5b2 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Tue, 23 Feb 2021 19:34:56 +0100 Subject: [PATCH 367/506] net: usb: qmi_wwan: support ZTE P685M modem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that interface 3 in "option" driver is no longer mapped, add device ID matching it to qmi_wwan. The modem is used inside ZTE MF283+ router and carriers identify it as such. Interface mapping is: 0: QCDM, 1: AT (PCUI), 2: AT (Modem), 3: QMI, 4: ADB T: Bus=02 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 3 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=1275 Rev=f0.00 S: Manufacturer=ZTE,Incorporated S: Product=ZTE Technologies MSM S: SerialNumber=P685M510ZTED0000CP&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&0 C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=87(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Acked-by: Bjørn Mork Signed-off-by: Lech Perczak Link: https://lore.kernel.org/r/20210223183456.6377-1-lech.perczak@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 6c3d8c2abd38..17a050521b86 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1318,6 +1318,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x1255, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1256, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1270, 5)}, /* ZTE MF667 */ + {QMI_FIXED_INTF(0x19d2, 0x1275, 3)}, /* ZTE P685M */ {QMI_FIXED_INTF(0x19d2, 0x1401, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1402, 2)}, /* ZTE MF60 */ {QMI_FIXED_INTF(0x19d2, 0x1424, 2)}, From 4e096a18867a5a989b510f6999d9c6b6622e8f7b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 23 Feb 2021 08:01:26 +0100 Subject: [PATCH 368/506] net: introduce CAN specific pointer in the struct net_device Since 20dd3850bcf8 ("can: Speed up CAN frame receiption by using ml_priv") the CAN framework uses per device specific data in the AF_CAN protocol. For this purpose the struct net_device->ml_priv is used. Later the ml_priv usage in CAN was extended for other users, one of them being CAN_J1939. Later in the kernel ml_priv was converted to an union, used by other drivers. E.g. the tun driver started storing it's stats pointer. Since tun devices can claim to be a CAN device, CAN specific protocols will wrongly interpret this pointer, which will cause system crashes. Mostly this issue is visible in the CAN_J1939 stack. To fix this issue, we request a dedicated CAN pointer within the net_device struct. Reported-by: syzbot+5138c4dd15a0401bec7b@syzkaller.appspotmail.com Fixes: 20dd3850bcf8 ("can: Speed up CAN frame receiption by using ml_priv") Fixes: ffd956eef69b ("can: introduce CAN midlayer private and allocate it automatically") Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Fixes: 497a5757ce4e ("tun: switch to net core provided statistics counters") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20210223070127.4538-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/can/dev/dev.c | 4 +++- drivers/net/can/slcan.c | 4 +++- drivers/net/can/vcan.c | 2 +- drivers/net/can/vxcan.c | 6 +++++- include/linux/can/can-ml.h | 12 ++++++++++++ include/linux/netdevice.h | 34 +++++++++++++++++++++++++++++++++- net/can/af_can.c | 34 ++-------------------------------- net/can/j1939/main.c | 22 ++++++++-------------- net/can/j1939/socket.c | 13 ++++--------- net/can/proc.c | 19 +++++++++++++------ 10 files changed, 84 insertions(+), 66 deletions(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index d9281ae853f8..311d8564d611 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -239,6 +239,7 @@ void can_setup(struct net_device *dev) struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs) { + struct can_ml_priv *can_ml; struct net_device *dev; struct can_priv *priv; int size; @@ -270,7 +271,8 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, priv = netdev_priv(dev); priv->dev = dev; - dev->ml_priv = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN); + can_ml = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN); + can_set_ml_priv(dev, can_ml); if (echo_skb_max) { priv->echo_skb_max = echo_skb_max; diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index a1bd1be09548..30c8d53c9745 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -516,6 +516,7 @@ static struct slcan *slc_alloc(void) int i; char name[IFNAMSIZ]; struct net_device *dev = NULL; + struct can_ml_priv *can_ml; struct slcan *sl; int size; @@ -538,7 +539,8 @@ static struct slcan *slc_alloc(void) dev->base_addr = i; sl = netdev_priv(dev); - dev->ml_priv = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN); + can_ml = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN); + can_set_ml_priv(dev, can_ml); /* Initialize channel control data */ sl->magic = SLCAN_MAGIC; diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 39ca14b0585d..067705e2850b 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -153,7 +153,7 @@ static void vcan_setup(struct net_device *dev) dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; - dev->ml_priv = netdev_priv(dev); + can_set_ml_priv(dev, netdev_priv(dev)); /* set flags according to driver capabilities */ if (echo) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index f9a524c5f6d6..8861a7d875e7 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -141,6 +141,8 @@ static const struct net_device_ops vxcan_netdev_ops = { static void vxcan_setup(struct net_device *dev) { + struct can_ml_priv *can_ml; + dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; @@ -149,7 +151,9 @@ static void vxcan_setup(struct net_device *dev) dev->flags = (IFF_NOARP|IFF_ECHO); dev->netdev_ops = &vxcan_netdev_ops; dev->needs_free_netdev = true; - dev->ml_priv = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); + + can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); + can_set_ml_priv(dev, can_ml); } /* forward declaration for rtnl_create_link() */ diff --git a/include/linux/can/can-ml.h b/include/linux/can/can-ml.h index 2f5d731ae251..8afa92d15a66 100644 --- a/include/linux/can/can-ml.h +++ b/include/linux/can/can-ml.h @@ -44,6 +44,7 @@ #include #include +#include #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) #define CAN_EFF_RCV_HASH_BITS 10 @@ -65,4 +66,15 @@ struct can_ml_priv { #endif }; +static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev) +{ + return netdev_get_ml_priv(dev, ML_PRIV_CAN); +} + +static inline void can_set_ml_priv(struct net_device *dev, + struct can_ml_priv *ml_priv) +{ + netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN); +} + #endif /* CAN_ML_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ddf4cfc12615..f06fbee8638e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1584,6 +1584,12 @@ enum netdev_priv_flags { #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK +/* Specifies the type of the struct net_device::ml_priv pointer */ +enum netdev_ml_priv_type { + ML_PRIV_NONE, + ML_PRIV_CAN, +}; + /** * struct net_device - The DEVICE structure. * @@ -1779,6 +1785,7 @@ enum netdev_priv_flags { * @nd_net: Network namespace this network device is inside * * @ml_priv: Mid-layer private + * @ml_priv_type: Mid-layer private type * @lstats: Loopback statistics * @tstats: Tunnel statistics * @dstats: Dummy statistics @@ -2094,8 +2101,10 @@ struct net_device { possible_net_t nd_net; /* mid-layer private */ + void *ml_priv; + enum netdev_ml_priv_type ml_priv_type; + union { - void *ml_priv; struct pcpu_lstats __percpu *lstats; struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; @@ -2286,6 +2295,29 @@ static inline void netdev_reset_rx_headroom(struct net_device *dev) netdev_set_rx_headroom(dev, -1); } +static inline void *netdev_get_ml_priv(struct net_device *dev, + enum netdev_ml_priv_type type) +{ + if (dev->ml_priv_type != type) + return NULL; + + return dev->ml_priv; +} + +static inline void netdev_set_ml_priv(struct net_device *dev, + void *ml_priv, + enum netdev_ml_priv_type type) +{ + WARN(dev->ml_priv_type && dev->ml_priv_type != type, + "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n", + dev->ml_priv_type, type); + WARN(!dev->ml_priv_type && dev->ml_priv, + "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n"); + + dev->ml_priv = ml_priv; + dev->ml_priv_type = type; +} + /* * Net namespace inlines */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 837bb8af0ec3..cce2af10eb3e 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -304,8 +304,8 @@ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, struct net_device *dev) { if (dev) { - struct can_ml_priv *ml_priv = dev->ml_priv; - return &ml_priv->dev_rcv_lists; + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + return &can_ml->dev_rcv_lists; } else { return net->can.rx_alldev_list; } @@ -790,25 +790,6 @@ void can_proto_unregister(const struct can_proto *cp) } EXPORT_SYMBOL(can_proto_unregister); -/* af_can notifier to create/remove CAN netdevice specific structs */ -static int can_notifier(struct notifier_block *nb, unsigned long msg, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; - - switch (msg) { - case NETDEV_REGISTER: - WARN(!dev->ml_priv, - "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n"); - break; - } - - return NOTIFY_DONE; -} - static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); @@ -876,11 +857,6 @@ static const struct net_proto_family can_family_ops = { .owner = THIS_MODULE, }; -/* notifier block for netdevice event */ -static struct notifier_block can_netdev_notifier __read_mostly = { - .notifier_call = can_notifier, -}; - static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, @@ -911,17 +887,12 @@ static __init int can_init(void) err = sock_register(&can_family_ops); if (err) goto out_sock; - err = register_netdevice_notifier(&can_netdev_notifier); - if (err) - goto out_notifier; dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); return 0; -out_notifier: - sock_unregister(PF_CAN); out_sock: unregister_pernet_subsys(&can_pernet_ops); out_pernet: @@ -935,7 +906,6 @@ static __exit void can_exit(void) /* protocol unregister */ dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); - unregister_netdevice_notifier(&can_netdev_notifier); sock_unregister(PF_CAN); unregister_pernet_subsys(&can_pernet_ops); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index bb914d8b4216..da3a7a7bcff2 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -140,9 +140,9 @@ static struct j1939_priv *j1939_priv_create(struct net_device *ndev) static inline void j1939_priv_set(struct net_device *ndev, struct j1939_priv *priv) { - struct can_ml_priv *can_ml_priv = ndev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(ndev); - can_ml_priv->j1939_priv = priv; + can_ml->j1939_priv = priv; } static void __j1939_priv_release(struct kref *kref) @@ -211,12 +211,9 @@ static void __j1939_rx_release(struct kref *kref) /* get pointer to priv without increasing ref counter */ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { - struct can_ml_priv *can_ml_priv = ndev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(ndev); - if (!can_ml_priv) - return NULL; - - return can_ml_priv->j1939_priv; + return can_ml->j1939_priv; } static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) @@ -225,9 +222,6 @@ static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) lockdep_assert_held(&j1939_netdev_lock); - if (ndev->type != ARPHRD_CAN) - return NULL; - priv = j1939_ndev_to_priv(ndev); if (priv) j1939_priv_get(priv); @@ -348,15 +342,16 @@ static int j1939_netdev_notify(struct notifier_block *nb, unsigned long msg, void *data) { struct net_device *ndev = netdev_notifier_info_to_dev(data); + struct can_ml_priv *can_ml = can_get_ml_priv(ndev); struct j1939_priv *priv; + if (!can_ml) + goto notify_done; + priv = j1939_priv_get_by_ndev(ndev); if (!priv) goto notify_done; - if (ndev->type != ARPHRD_CAN) - goto notify_put; - switch (msg) { case NETDEV_DOWN: j1939_cancel_active_session(priv, NULL); @@ -365,7 +360,6 @@ static int j1939_netdev_notify(struct notifier_block *nb, break; } -notify_put: j1939_priv_put(priv); notify_done: diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index f23966526a88..56aa66147d5a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -453,6 +454,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { + struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); @@ -461,15 +463,8 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) goto out_release_sock; } - if (ndev->type != ARPHRD_CAN) { - dev_put(ndev); - ret = -ENODEV; - goto out_release_sock; - } - - if (!ndev->ml_priv) { - netdev_warn_once(ndev, - "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n"); + can_ml = can_get_ml_priv(ndev); + if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; diff --git a/net/can/proc.c b/net/can/proc.c index 5ea8695f507e..b15760b5c1cc 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -322,8 +322,11 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) /* receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) - can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv); + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (can_ml) + can_rcvlist_proc_show_one(m, idx, dev, + &can_ml->dev_rcv_lists); } rcu_read_unlock(); @@ -375,8 +378,10 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) /* sff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) { - dev_rcv_lists = dev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (can_ml) { + dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff, ARRAY_SIZE(dev_rcv_lists->rx_sff)); } @@ -406,8 +411,10 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) /* eff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) { - dev_rcv_lists = dev->ml_priv; + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (can_ml) { + dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff, ARRAY_SIZE(dev_rcv_lists->rx_eff)); } From 17d7fd47aa9063c2ff36988e36757ac345733e28 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 23 Feb 2021 10:48:03 +0000 Subject: [PATCH 369/506] net: stmmac: Fix missing spin_lock_init in visconti_eth_dwmac_probe() The driver allocates the spinlock but not initialize it. Use spin_lock_init() on it to initialize it correctly. Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Acked-by: Nobuhiro Iwamatsu Link: https://lore.kernel.org/r/20210223104803.4047281-1-weiyongjun1@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index b7a0c57dfbfb..d23be45a64e5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -218,6 +218,7 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev) goto remove_config; } + spin_lock_init(&dwmac->lock); dwmac->reg = stmmac_res.addr; plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = visconti_eth_fix_mac_speed; From 8f1c0fd2c84c8bf738b7139d09d4ea53027f47c3 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 23 Feb 2021 21:02:29 -0800 Subject: [PATCH 370/506] ibmvnic: fix a race between open and reset __ibmvnic_reset() currently reads the adapter->state before getting the rtnl and saves that state as the "target state" for the reset. If this read occurs when adapter is in PROBED state, the target state would be PROBED. Just after the target state is saved, and before the actual reset process is started (i.e before rtnl is acquired) if we get an ibmvnic_open() call we would move the adapter to OPEN state. But when the reset is processed (after ibmvnic_open()) drops the rtnl), it will leave the adapter in PROBED state even though we already moved it to OPEN. To fix this, use the RTNL to improve serialization when reading/updating the adapter state. i.e determine the target state of a reset only after getting the RTNL. And if a reset is in progress during an open, simply set the target state of the adapter and let the reset code finish the open (like we currently do if failover is pending). One twist to this serialization is if the adapter state changes when we drop the RTNL to update the link state. Account for this by checking if there was an intervening open and update the target state for the reset accordingly (see new comments in the code). Note that only the reset functions and ibmvnic_open() can set the adapter to OPEN state and this must happen under rtnl. Fixes: 7d7195a026ba ("ibmvnic: Do not process device remove during device reset") Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Link: https://lore.kernel.org/r/20210224050229.1155468-1-sukadev@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 63 ++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 1c0e4beb56e7..118a4bd3f877 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1172,12 +1172,25 @@ static int ibmvnic_open(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc; - /* If device failover is pending, just set device state and return. - * Device operation will be handled by reset routine. + ASSERT_RTNL(); + + /* If device failover is pending or we are about to reset, just set + * device state and return. Device operation will be handled by reset + * routine. + * + * It should be safe to overwrite the adapter->state here. Since + * we hold the rtnl, either the reset has not actually started or + * the rtnl got dropped during the set_link_state() in do_reset(). + * In the former case, no one else is changing the state (again we + * have the rtnl) and in the latter case, do_reset() will detect and + * honor our setting below. */ - if (adapter->failover_pending) { + if (adapter->failover_pending || (test_bit(0, &adapter->resetting))) { + netdev_dbg(netdev, "[S:%d FOP:%d] Resetting, deferring open\n", + adapter->state, adapter->failover_pending); adapter->state = VNIC_OPEN; - return 0; + rc = 0; + goto out; } if (adapter->state != VNIC_CLOSED) { @@ -1196,10 +1209,12 @@ static int ibmvnic_open(struct net_device *netdev) rc = __ibmvnic_open(netdev); out: - /* If open fails due to a pending failover, set device state and - * return. Device operation will be handled by reset routine. + /* If open failed and there is a pending failover or in-progress reset, + * set device state and return. Device operation will be handled by + * reset routine. See also comments above regarding rtnl. */ - if (rc && adapter->failover_pending) { + if (rc && + (adapter->failover_pending || (test_bit(0, &adapter->resetting)))) { adapter->state = VNIC_OPEN; rc = 0; } @@ -1928,6 +1943,14 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (rwi->reset_reason == VNIC_RESET_FAILOVER) adapter->failover_pending = false; + /* read the state and check (again) after getting rtnl */ + reset_state = adapter->state; + + if (reset_state == VNIC_REMOVING || reset_state == VNIC_REMOVED) { + rc = -EBUSY; + goto out; + } + netif_carrier_off(netdev); old_num_rx_queues = adapter->req_rx_queues; @@ -1958,11 +1981,27 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (rc) goto out; + if (adapter->state == VNIC_OPEN) { + /* When we dropped rtnl, ibmvnic_open() got + * it and noticed that we are resetting and + * set the adapter state to OPEN. Update our + * new "target" state, and resume the reset + * from VNIC_CLOSING state. + */ + netdev_dbg(netdev, + "Open changed state from %d, updating.\n", + reset_state); + reset_state = VNIC_OPEN; + adapter->state = VNIC_CLOSING; + } + if (adapter->state != VNIC_CLOSING) { + /* If someone else changed the adapter state + * when we dropped the rtnl, fail the reset + */ rc = -1; goto out; } - adapter->state = VNIC_CLOSED; } } @@ -2106,6 +2145,14 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter, netdev_dbg(adapter->netdev, "Hard resetting driver (%d)\n", rwi->reset_reason); + /* read the state and check (again) after getting rtnl */ + reset_state = adapter->state; + + if (reset_state == VNIC_REMOVING || reset_state == VNIC_REMOVED) { + rc = -EBUSY; + goto out; + } + netif_carrier_off(netdev); adapter->reset_reason = rwi->reset_reason; From 7a8a4b0729a8807e37196e44629b31ee03f88872 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Fri, 19 Feb 2021 12:25:47 +0800 Subject: [PATCH 371/506] drm/ttm: Fix a memory leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Free the memory on failure. Also no need to re-alloc memory on retry. Signed-off-by: xinhui pan Link: https://patchwork.freedesktop.org/patch/msgid/20210219042547.44855-1-xinhui.pan@amd.com Reviewed-by: Christian König CC: stable@vger.kernel.org # 5.11 Signed-off-by: Christian König Signed-off-by: Maarten Lankhorst --- drivers/gpu/drm/ttm/ttm_bo.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index b65f4b12f986..20a25660b35b 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -959,8 +959,10 @@ static int ttm_bo_bounce_temp_buffer(struct ttm_buffer_object *bo, return ret; /* move to the bounce domain */ ret = ttm_bo_handle_move_mem(bo, &hop_mem, false, ctx, NULL); - if (ret) + if (ret) { + ttm_resource_free(bo, &hop_mem); return ret; + } return 0; } @@ -991,18 +993,19 @@ static int ttm_bo_move_buffer(struct ttm_buffer_object *bo, * stop and the driver will be called to make * the second hop. */ -bounce: ret = ttm_bo_mem_space(bo, placement, &mem, ctx); if (ret) return ret; +bounce: ret = ttm_bo_handle_move_mem(bo, &mem, false, ctx, &hop); if (ret == -EMULTIHOP) { ret = ttm_bo_bounce_temp_buffer(bo, &mem, ctx, &hop); if (ret) - return ret; + goto out; /* try and move to final place now. */ goto bounce; } +out: if (ret) ttm_resource_free(bo, &mem); return ret; From 5f94e3571459abb626077aedb65d71264c2a58c0 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 11 Aug 2020 16:26:31 -0400 Subject: [PATCH 372/506] drm/rockchip: Require the YTR modifier for AFBC The AFBC decoder used in the Rockchip VOP assumes the use of the YUV-like colourspace transform (YTR). YTR is lossless for RGB(A) buffers, which covers the RGBA8 and RGB565 formats supported in vop_convert_afbc_format. Use of YTR is signaled with the AFBC_FORMAT_MOD_YTR modifier, which prior to this commit was missing. As such, a producer would have to generate buffers that do not use YTR, which the VOP would erroneously decode as YTR, leading to severe visual corruption. The upstream AFBC support was developed against a captured frame, which failed to exercise modifier support. Prior to bring-up of AFBC in Mesa (in the Panfrost driver), no open userspace respected modifier reporting. As such, this change is not expected to affect broken userspaces. Tested on RK3399 with Panfrost and Weston. Fixes: 7707f7227f09 ("drm/rockchip: Add support for afbc") Cc: stable@vger.kernel.org Signed-off-by: Alyssa Rosenzweig Acked-by: Daniel Stone Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20200811202631.3603-1-alyssa.rosenzweig@collabora.com Signed-off-by: Maarten Lankhorst --- drivers/gpu/drm/rockchip/rockchip_drm_vop.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.h b/drivers/gpu/drm/rockchip/rockchip_drm_vop.h index 4a2099cb582e..857d97cdc67c 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.h +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.h @@ -17,9 +17,20 @@ #define NUM_YUV2YUV_COEFFICIENTS 12 +/* AFBC supports a number of configurable modes. Relevant to us is block size + * (16x16 or 32x8), storage modifiers (SPARSE, SPLIT), and the YUV-like + * colourspace transform (YTR). 16x16 SPARSE mode is always used. SPLIT mode + * could be enabled via the hreg_block_split register, but is not currently + * handled. The colourspace transform is implicitly always assumed by the + * decoder, so consumers must use this transform as well. + * + * Failure to match modifiers will cause errors displaying AFBC buffers + * produced by conformant AFBC producers, including Mesa. + */ #define ROCKCHIP_AFBC_MOD \ DRM_FORMAT_MOD_ARM_AFBC( \ AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 | AFBC_FORMAT_MOD_SPARSE \ + | AFBC_FORMAT_MOD_YTR \ ) enum vop_data_format { From d922d58fedcd98ba625e89b625a98e222b090b10 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Sat, 6 Feb 2021 14:50:20 +0100 Subject: [PATCH 373/506] drm/panel: kd35t133: allow using non-continuous dsi clock The panel is able to work when dsi clock is non-continuous, thus the system power consumption can be reduced using such feature. Add MIPI_DSI_CLOCK_NON_CONTINUOUS to panel's mode_flags. Also the flag actually becomes necessary after commit c6d94e37bdbb ("drm/bridge/synopsys: dsi: add support for non-continuous HS clock") and without it the panel only emits stripes instead of output. Fixes: c6d94e37bdbb ("drm/bridge/synopsys: dsi: add support for non-continuous HS clock") Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Heiko Stuebner Tested-by: Ezequiel Garcia Reviewed-by: Christopher Morgan Link: https://patchwork.freedesktop.org/patch/msgid/20210206135020.1991820-1-heiko@sntech.de Signed-off-by: Maarten Lankhorst --- drivers/gpu/drm/panel/panel-elida-kd35t133.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-elida-kd35t133.c b/drivers/gpu/drm/panel/panel-elida-kd35t133.c index bc36aa3c1123..fe5ac3ef9018 100644 --- a/drivers/gpu/drm/panel/panel-elida-kd35t133.c +++ b/drivers/gpu/drm/panel/panel-elida-kd35t133.c @@ -265,7 +265,8 @@ static int kd35t133_probe(struct mipi_dsi_device *dsi) dsi->lanes = 1; dsi->format = MIPI_DSI_FMT_RGB888; dsi->mode_flags = MIPI_DSI_MODE_VIDEO | MIPI_DSI_MODE_VIDEO_BURST | - MIPI_DSI_MODE_LPM | MIPI_DSI_MODE_EOT_PACKET; + MIPI_DSI_MODE_LPM | MIPI_DSI_MODE_EOT_PACKET | + MIPI_DSI_CLOCK_NON_CONTINUOUS; drm_panel_init(&ctx->panel, &dsi->dev, &kd35t133_funcs, DRM_MODE_CONNECTOR_DSI); From 2df8d3807ce7f75bb975f1aeae8fc6757527c62d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Feb 2021 16:56:26 -0800 Subject: [PATCH 374/506] KVM: SVM: Fix nested VM-Exit on #GP interception handling Fix the interpreation of nested_svm_vmexit()'s return value when synthesizing a nested VM-Exit after intercepting an SVM instruction while L2 was running. The helper returns '0' on success, whereas a return value of '0' in the exit handler path means "exit to userspace". The incorrect return value causes KVM to exit to userspace without filling the run state, e.g. QEMU logs "KVM: unknown exit, hardware reason 0". Fixes: 14c2bf81fcd2 ("KVM: SVM: Fix #GP handling for doubly-nested virtualization") Signed-off-by: Sean Christopherson Message-Id: <20210224005627.657028-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e4cc081ad7c1..c636021b066b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2211,15 +2211,20 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) [SVM_INSTR_VMSAVE] = vmsave_interception, }; struct vcpu_svm *svm = to_svm(vcpu); + int ret; if (is_guest_mode(vcpu)) { svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode]; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; - return nested_svm_vmexit(svm); - } else - return svm_instr_handlers[opcode](svm); + /* Returns '1' or -errno on failure, '0' on success. */ + ret = nested_svm_vmexit(svm); + if (ret) + return ret; + return 1; + } + return svm_instr_handlers[opcode](svm); } /* From df84fe94708985cdfb78a83148322bcd0a699472 Mon Sep 17 00:00:00 2001 From: Timothy E Baldwin Date: Sat, 16 Jan 2021 15:18:54 +0000 Subject: [PATCH 375/506] arm64: ptrace: Fix seccomp of traced syscall -1 (NO_SYSCALL) Since commit f086f67485c5 ("arm64: ptrace: add support for syscall emulation"), if system call number -1 is called and the process is being traced with PTRACE_SYSCALL, for example by strace, the seccomp check is skipped and -ENOSYS is returned unconditionally (unless altered by the tracer) rather than carrying out action specified in the seccomp filter. The consequence of this is that it is not possible to reliably strace a seccomp based implementation of a foreign system call interface in which r7/x8 is permitted to be -1 on entry to a system call. Also trace_sys_enter and audit_syscall_entry are skipped if a system call is skipped. Fix by removing the in_syscall(regs) check restoring the previous behaviour which is like AArch32, x86 (which uses generic code) and everything else. Cc: Oleg Nesterov Cc: Catalin Marinas Cc: Fixes: f086f67485c5 ("arm64: ptrace: add support for syscall emulation") Reviewed-by: Kees Cook Reviewed-by: Sudeep Holla Tested-by: Sudeep Holla Signed-off-by: Timothy E Baldwin Link: https://lore.kernel.org/r/90edd33b-6353-1228-791f-0336d94d5f8c@majoroak.me.uk Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 3d5c8afca75b..170f42fd6101 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1797,7 +1797,7 @@ int syscall_trace_enter(struct pt_regs *regs) if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); - if (!in_syscall(regs) || (flags & _TIF_SYSCALL_EMU)) + if (flags & _TIF_SYSCALL_EMU) return NO_SYSCALL; } From 3c02600144bdb0a1280a9090d3a7e37e2f9fdcc8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Feb 2021 16:50:37 +0000 Subject: [PATCH 376/506] arm64: stacktrace: Report when we reach the end of the stack Currently the arm64 unwinder code returns -EINVAL whenever it can't find the next stack frame, not distinguishing between cases where the stack has been corrupted or is otherwise in a state it shouldn't be and cases where we have reached the end of the stack. At the minute none of the callers care what error code is returned but this will be important for reliable stack trace which needs to be sure that the stack is intact. Change to return -ENOENT in the case where we reach the bottom of the stack. The error codes from this function are only used in kernel, this particular code is chosen as we are indicating that we know there is no frame there. Signed-off-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210224165037.24138-1-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 0fb42129b469..ad20981dfda4 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -46,7 +46,7 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) /* Terminal record; nothing to unwind */ if (!fp) - return -EINVAL; + return -ENOENT; if (fp & 0xf) return -EINVAL; From fcd4ba3bcba78a97a0f8bdb5df37bc74820f9a62 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 23 Feb 2021 12:20:03 +0100 Subject: [PATCH 377/506] net: dsa: sja1105: Remove unneeded cast in sja1105_crc32() sja1105_unpack() takes a "const void *buf" as its first parameter, so there is no need to cast away the "const" of the "buf" variable before calling it. Drop the cast, as it prevents the compiler performing some checks. Signed-off-by: Geert Uytterhoeven Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210223112003.2223332-1-geert+renesas@glider.be Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_static_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index 139b7b4fbd0d..a8efb7fac395 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -85,7 +85,7 @@ u32 sja1105_crc32(const void *buf, size_t len) /* seed */ crc = ~0; for (i = 0; i < len; i += 4) { - sja1105_unpack((void *)buf + i, &word, 31, 0, 4); + sja1105_unpack(buf + i, &word, 31, 0, 4); crc = crc32_le(crc, (u8 *)&word, 4); } return ~crc; From f176411401127a07a9360dec14eca448eb2e9d45 Mon Sep 17 00:00:00 2001 From: Marco Wenzel Date: Wed, 24 Feb 2021 10:46:49 +0100 Subject: [PATCH 378/506] net: hsr: add support for EntryForgetTime In IEC 62439-3 EntryForgetTime is defined with a value of 400 ms. When a node does not send any frame within this time, the sequence number check for can be ignored. This solves communication issues with Cisco IE 2000 in Redbox mode. Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Marco Wenzel Reviewed-by: George McCollister Tested-by: George McCollister Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210224094653.1440-1-marco.wenzel@a-eberle.de Signed-off-by: Jakub Kicinski --- net/hsr/hsr_framereg.c | 9 +++++++-- net/hsr/hsr_framereg.h | 1 + net/hsr/hsr_main.h | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index f9a8cc82ae2e..bb1351c38397 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -164,8 +164,10 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; - for (i = 0; i < HSR_PT_PORTS; i++) + for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; + new_node->time_out[i] = now; + } for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; @@ -413,9 +415,12 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr) { - if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type])) + if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && + time_is_after_jiffies(node->time_out[port->type] + + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) return 1; + node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; return 0; } diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 86b43f539f2c..d9628e7a5f05 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -75,6 +75,7 @@ struct hsr_node { enum hsr_port_type addr_B_port; unsigned long time_in[HSR_PT_PORTS]; bool time_in_stale[HSR_PT_PORTS]; + unsigned long time_out[HSR_PT_PORTS]; /* if the node is a SAN */ bool san_a; bool san_b; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index a169808ee78a..8f264672b70b 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -22,6 +22,7 @@ #define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */ #define HSR_NODE_FORGET_TIME 60000 /* ms */ #define HSR_ANNOUNCE_INTERVAL 100 /* ms */ +#define HSR_ENTRY_FORGET_TIME 400 /* ms */ /* By how much may slave1 and slave2 timestamps of latest received frame from * each node differ before we notify of communication problem? From 4dc7f09b8becfa35a55430a49d95acf19f996e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 24 Feb 2021 16:18:41 +0100 Subject: [PATCH 379/506] net: broadcom: bcm4908_enet: fix RX path possible mem leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After filling RX ring slot with new skb it's required to free old skb. Immediately on error or later in the net subsystem. Fixes: 4feffeadbcb2 ("net: broadcom: bcm4908enet: add BCM4908 controller driver") Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210224151842.2419-1-zajec5@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm4908_enet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 9be33dc98072..7983c7a9fca9 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -570,6 +570,7 @@ static int bcm4908_enet_poll(struct napi_struct *napi, int weight) if (len < ETH_ZLEN || (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) { + kfree_skb(slot.skb); enet->netdev->stats.rx_dropped++; break; } From 4d9274cee40b6a20dd6148c6c81c6733c2678cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 24 Feb 2021 16:18:42 +0100 Subject: [PATCH 380/506] net: broadcom: bcm4908_enet: fix NAPI poll returned value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missing increment was resulting in poll function always returning 0 instead of amount of processed packets. Fixes: 4feffeadbcb2 ("net: broadcom: bcm4908enet: add BCM4908 controller driver") Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210224151842.2419-2-zajec5@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm4908_enet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 7983c7a9fca9..0b70e9e0ddad 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -583,6 +583,8 @@ static int bcm4908_enet_poll(struct napi_struct *napi, int weight) enet->netdev->stats.rx_packets++; enet->netdev->stats.rx_bytes += len; + + handled++; } if (handled < weight) { From a93dcaada2ddb58dbc72652b42548adedd646d7a Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Thu, 25 Feb 2021 15:51:45 +0800 Subject: [PATCH 381/506] net: psample: Fix netlink skb length with tunnel info Currently, the psample netlink skb is allocated with a size that does not account for the nested 'PSAMPLE_ATTR_TUNNEL' attribute and the padding required for the 64-bit attribute 'PSAMPLE_TUNNEL_KEY_ATTR_ID'. This can result in failure to add attributes to the netlink skb due to insufficient tail room. The following error message is printed to the kernel log: "Could not create psample log message". Fix this by adjusting the allocation size to take into account the nested attribute and the padding. Fixes: d8bed686ab96 ("net: psample: Add tunnel support") CC: Yotam Gigi Reviewed-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Chris Mi Link: https://lore.kernel.org/r/20210225075145.184314-1-cmi@nvidia.com Signed-off-by: Jakub Kicinski --- net/psample/psample.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/psample/psample.c b/net/psample/psample.c index 33e238c965bd..482c07f2766b 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -309,10 +309,10 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) unsigned short tun_proto = ip_tunnel_info_af(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; - int sum = 0; + int sum = nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */ if (tun_key->tun_flags & TUNNEL_KEY) - sum += nla_total_size(sizeof(u64)); + sum += nla_total_size_64bit(sizeof(u64)); if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) sum += nla_total_size(0); From d814567942ff6ac73869052bdb8ca911364e5eb0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 25 Feb 2021 12:49:58 -0500 Subject: [PATCH 382/506] mm, tracing: Fix kmem_cache_free trace event to not print stale pointers The update to kmem_cache_free trace event added printing of the slab name in the trace event. But it only stores the pointer of the name which will be printed as a string when the event is read some time in the future. This is dangerous because the name could be freed in the mean time and when reading the trace event it would try to dereference the string name by the pointer to the name that has been freed. Instead, use the trace event helper macros __string(), __assign_str(), and __get_str() that are for this very case. Cc: Jacob Wen Fixes: 3544de8ee6e4 ("mm, tracing: record slab name for kmem_cache_free()") Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/kmem.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 40845b0d5dad..3a60b6b6db32 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -144,17 +144,17 @@ TRACE_EVENT(kmem_cache_free, TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) - __field( const char *, name ) + __string( name, name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __entry->name = name; + __assign_str(name, name); ), TP_printk("call_site=%pS ptr=%p name=%s", - (void *)__entry->call_site, __entry->ptr, __entry->name) + (void *)__entry->call_site, __entry->ptr, __get_str(name)) ); TRACE_EVENT(mm_page_free, From 764d31cacfe48440745c4bbb55a62ac9471c9f19 Mon Sep 17 00:00:00 2001 From: Christian Melki Date: Wed, 24 Feb 2021 21:55:36 +0100 Subject: [PATCH 383/506] net: phy: micrel: set soft_reset callback to genphy_soft_reset for KSZ8081 Following a similar reinstate for the KSZ9031. Older kernels would use the genphy_soft_reset if the PHY did not implement a .soft_reset. Bluntly removing that default may expose a lot of situations where various PHYs/board implementations won't recover on various changes. Like with this implementation during a 4.9.x to 5.4.x LTS transition. I think it's a good thing to remove unwanted soft resets but wonder if it did open a can of worms? Atleast this fixes one iMX6 FEC/RMII/8081 combo. Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") Signed-off-by: Christian Melki Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210224205536.9349-1-christian.melki@t2data.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 7ec6f70d6a82..a14a00328fa3 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1303,6 +1303,7 @@ static struct phy_driver ksphy_driver[] = { .driver_data = &ksz8081_type, .probe = kszphy_probe, .config_init = ksz8081_config_init, + .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, .get_sset_count = kszphy_get_sset_count, From c1d96fa61eb74b1e211f1653acc5b68ac62c8ef4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 25 Feb 2021 17:52:48 +0100 Subject: [PATCH 384/506] tracing/tools: fix a couple of spelling mistakes There is a spelling mistake in the -g help option, I believe it should be "graph". There is also a spelling mistake in a warning message. Fix both mistakes. Link: https://lkml.kernel.org/r/20210225165248.22050-2-Viktor.Rosendahl@bmw.de Signed-off-by: Colin Ian King Signed-off-by: Viktor Rosendahl Signed-off-by: Steven Rostedt (VMware) --- tools/tracing/latency/latency-collector.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index 57b20802e71b..b69de9263ee6 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -1650,7 +1650,7 @@ static void start_printthread(void) if (ufd < 0 || read(ufd, seed, sizeof(*seed)) != sizeof(*seed)) { printf( -"Warning! Using trivial random nummer seed, since %s not available\n", +"Warning! Using trivial random number seed, since %s not available\n", DEV_URANDOM); fflush(stdout); *seed = i; @@ -1711,8 +1711,8 @@ static void show_usage(void) "\t\t\tbeginning, end, and backtrace.\n\n" "-g, --graph\t\tEnable the display-graph option in trace_option. This\n" -"\t\t\toption causes ftrace to show the functionph of how\n" -"\t\t\tfunctions are calling other functions.\n\n" +"\t\t\toption causes ftrace to show the graph of how functions\n" +"\t\t\tare calling other functions.\n\n" "-c, --policy POL\tRun the program with scheduling policy POL. POL can be\n" "\t\t\tother, batch, idle, rr or fifo. The default is rr. When\n" From 6cf739131a15e4177e58a1b4f2bede9d5da78552 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 25 Feb 2021 16:05:19 +0100 Subject: [PATCH 385/506] r8169: fix jumbo packet handling on RTL8168e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Josef reported [0] that using jumbo packets fails on RTL8168e. Aligning the values for register MaxTxPacketSize with the vendor driver fixes the problem. [0] https://bugzilla.kernel.org/show_bug.cgi?id=211827 Fixes: d58d46b5d851 ("r8169: jumbo fixes.") Reported-by: Josef Oškera Tested-by: Josef Oškera Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/b15ddef7-0d50-4320-18f4-6a3f86fbfd3e@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0a20dae32184..f704da3f214c 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2285,14 +2285,14 @@ static void r8168dp_hw_jumbo_disable(struct rtl8169_private *tp) static void r8168e_hw_jumbo_enable(struct rtl8169_private *tp) { - RTL_W8(tp, MaxTxPacketSize, 0x3f); + RTL_W8(tp, MaxTxPacketSize, 0x24); RTL_W8(tp, Config3, RTL_R8(tp, Config3) | Jumbo_En0); RTL_W8(tp, Config4, RTL_R8(tp, Config4) | 0x01); } static void r8168e_hw_jumbo_disable(struct rtl8169_private *tp) { - RTL_W8(tp, MaxTxPacketSize, 0x0c); + RTL_W8(tp, MaxTxPacketSize, 0x3f); RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0); RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~0x01); } From 3d1dc719bca9988e08a8d68363a5c2514ccaf5d4 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 24 Feb 2021 14:57:06 -0800 Subject: [PATCH 386/506] parisc: select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY parisc uses -fpatchable-function-entry with dynamic ftrace, which means we don't need recordmcount. Select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY to tell that to the build system. Reported-by: Guenter Roeck Fixes: 3b15cdc15956 ("tracing: move function tracer options to Kconfig") Signed-off-by: Sami Tolvanen Tested-by: Guenter Roeck Tested-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210224225706.2726050-1-samitolvanen@google.com --- arch/parisc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 78b17621ee4a..0ebde059bd7e 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -61,6 +61,7 @@ config PARISC select HAVE_KRETPROBES select HAVE_DYNAMIC_FTRACE if $(cc-option,-fpatchable-function-entry=1,1) select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE + select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE select HAVE_KPROBES_ON_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select SET_FS From 4c7858b9001c85aacf86a74b3a68aa384bc33760 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 26 Feb 2021 04:39:12 +0900 Subject: [PATCH 387/506] kbuild: Move .thinlto-cache removal to 'make clean' Instead of 'make distclean', 'make clean' should remove build artifacts unneeded by external module builds. Obviously, you do not need to keep this directory. Fixes: dc5723b02e52 ("kbuild: add support for Clang LTO") Signed-off-by: Masahiro Yamada Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210225193912.3303604-1-masahiroy@kernel.org --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 639873def1fd..b21531de24f9 100644 --- a/Makefile +++ b/Makefile @@ -1491,7 +1491,7 @@ endif # CONFIG_MODULES # Directories & files removed with 'make clean' CLEAN_FILES += include/ksym vmlinux.symvers \ modules.builtin modules.builtin.modinfo modules.nsdeps \ - compile_commands.json + compile_commands.json .thinlto-cache # Directories & files removed with 'make mrproper' MRPROPER_FILES += include/config include/generated \ @@ -1505,7 +1505,7 @@ MRPROPER_FILES += include/config include/generated \ *.spec # Directories & files removed with 'make distclean' -DISTCLEAN_FILES += tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS .thinlto-cache +DISTCLEAN_FILES += tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS # clean - Delete most, but leave enough to build external modules # From f5b6a74d9c08b19740ca056876bf6584acdba582 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 29 Jan 2021 17:46:51 -0700 Subject: [PATCH 388/506] vmlinux.lds.h: Define SANTIZER_DISCARDS with CONFIG_GCOV_KERNEL=y clang produces .eh_frame sections when CONFIG_GCOV_KERNEL is enabled, even when -fno-asynchronous-unwind-tables is in KBUILD_CFLAGS: $ make CC=clang vmlinux ... ld: warning: orphan section `.eh_frame' from `init/main.o' being placed in section `.eh_frame' ld: warning: orphan section `.eh_frame' from `init/version.o' being placed in section `.eh_frame' ld: warning: orphan section `.eh_frame' from `init/do_mounts.o' being placed in section `.eh_frame' ld: warning: orphan section `.eh_frame' from `init/do_mounts_initrd.o' being placed in section `.eh_frame' ld: warning: orphan section `.eh_frame' from `init/initramfs.o' being placed in section `.eh_frame' ld: warning: orphan section `.eh_frame' from `init/calibrate.o' being placed in section `.eh_frame' ld: warning: orphan section `.eh_frame' from `init/init_task.o' being placed in section `.eh_frame' ... $ rg "GCOV_KERNEL|GCOV_PROFILE_ALL" .config CONFIG_GCOV_KERNEL=y CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y CONFIG_GCOV_PROFILE_ALL=y This was already handled for a couple of other options in commit d812db78288d ("vmlinux.lds.h: Avoid KASAN and KCSAN's unwanted sections") and there is an open LLVM bug for this issue. Take advantage of that section for this config as well so that there are no more orphan warnings. Link: https://bugs.llvm.org/show_bug.cgi?id=46478 Link: https://github.com/ClangBuiltLinux/linux/issues/1069 Reported-by: kernel test robot Reviewed-by: Fangrui Song Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Fixes: d812db78288d ("vmlinux.lds.h: Avoid KASAN and KCSAN's unwanted sections") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210130004650.2682422-1-nathan@kernel.org --- include/asm-generic/vmlinux.lds.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b97c628ad91f..ec89e657ea2f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -988,12 +988,13 @@ #endif /* - * Clang's -fsanitize=kernel-address and -fsanitize=thread produce - * unwanted sections (.eh_frame and .init_array.*), but - * CONFIG_CONSTRUCTORS wants to keep any .init_array.* sections. + * Clang's -fprofile-arcs, -fsanitize=kernel-address, and + * -fsanitize=thread produce unwanted sections (.eh_frame + * and .init_array.*), but CONFIG_CONSTRUCTORS wants to + * keep any .init_array.* sections. * https://bugs.llvm.org/show_bug.cgi?id=46478 */ -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN) +#if defined(CONFIG_GCOV_KERNEL) || defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN) # ifdef CONFIG_CONSTRUCTORS # define SANITIZER_DISCARDS \ *(.eh_frame) From c49f50d1983d53871ecc77b60c1fa69a2a5ca6d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:25 -0800 Subject: [PATCH 389/506] mm: make pagecache tagged lookups return only head pages Patch series "Overhaul multi-page lookups for THP", v4. This THP prep patchset changes several page cache iteration APIs to only return head pages. - It's only possible to tag head pages in the page cache, so only return head pages, not all their subpages. - Factor a lot of common code out of the various batch lookup routines - Add mapping_seek_hole_data() - Unify find_get_entries() and pagevec_lookup_entries() - Make find_get_entries only return head pages, like find_get_entry(). These are only loosely connected, but they seem to make sense together as a series. This patch (of 14): Pagecache tags are used for dirty page writeback. Since dirtiness is tracked on a per-THP basis, we only want to return the head page rather than each subpage of a tagged page. All the filesystems which use huge pages today are in-memory, so there are no tagged huge pages today. Link: https://lkml.kernel.org/r/20201112212641.27837-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Hugh Dickins Cc: Johannes Weiner Cc: Yang Shi Cc: Dave Chinner Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 46a8b9e82434..57eae5163bce 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2062,7 +2062,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_range_tag - find and return pages in given range matching @tag + * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search * @index: the starting page index * @end: The final page index (inclusive) @@ -2070,8 +2070,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages, except we only return pages which are tagged with - * @tag. We update @index to index the next page for the traversal. + * Like find_get_pages(), except we only return head pages which are tagged + * with @tag. @index is updated to the index immediately after the last + * page we return, ready for the next iteration. * * Return: the number of pages which were found. */ @@ -2105,9 +2106,9 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) { - *index = xas.xa_index + 1; + *index = page->index + thp_nr_pages(page); goto out; } continue; From 96888e0ab0e652eb3036eff0cb0664a96cb7e9a9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:29 -0800 Subject: [PATCH 390/506] mm/shmem: use pagevec_lookup in shmem_unlock_mapping The comment shows that the reason for using find_get_entries() is now stale; find_get_pages() will not return 0 if it hits a consecutive run of swap entries, and I don't believe it has since 2011. pagevec_lookup() is a simpler function to use than find_get_pages(), so use it instead. Link: https://lkml.kernel.org/r/20201112212641.27837-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ff741d229701..5ea1fa53db3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -842,7 +842,6 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) void shmem_unlock_mapping(struct address_space *mapping) { struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; pagevec_init(&pvec); @@ -850,16 +849,8 @@ void shmem_unlock_mapping(struct address_space *mapping) * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping)) { - /* - * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it - * has finished, if it hits a row of PAGEVEC_SIZE swap entries. - */ - pvec.nr = find_get_entries(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) + if (!pagevec_lookup(&pvec, mapping, &index)) break; - index = indices[pvec.nr - 1] + 1; - pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(&pvec); pagevec_release(&pvec); cond_resched(); From 8c647dd1e39573f23a4ca25c09f82716b70e702c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:33 -0800 Subject: [PATCH 391/506] mm/swap: optimise get_shadow_from_swap_cache There's no need to get a reference to the page, just load the entry and see if it's a shadow entry. Link: https://lkml.kernel.org/r/20201112212641.27837-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index c1a648d9092b..f270c30d4681 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -87,11 +87,9 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) pgoff_t idx = swp_offset(entry); struct page *page; - page = find_get_entry(address_space, idx); + page = xa_load(&address_space->i_pages, idx); if (xa_is_value(page)) return page; - if (page) - put_page(page); return NULL; } From 44835d20b2a0c9b4c0c3fb96e90f4e2fd4a4e41d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:36 -0800 Subject: [PATCH 392/506] mm: add FGP_ENTRY The functionality of find_lock_entry() and find_get_entry() can be provided by pagecache_get_page(), which lets us delete find_lock_entry() and make find_get_entry() static. Link: https://lkml.kernel.org/r/20201112212641.27837-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 1 + mm/filemap.c | 44 ++++++++--------------------------------- mm/internal.h | 3 --- mm/shmem.c | 3 ++- mm/swap_state.c | 3 ++- 5 files changed, 13 insertions(+), 41 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bd629d676a27..b379b2388202 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -315,6 +315,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 #define FGP_HEAD 0x00000080 +#define FGP_ENTRY 0x00000100 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); diff --git a/mm/filemap.c b/mm/filemap.c index 57eae5163bce..84b7813badf1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1658,7 +1658,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, } EXPORT_SYMBOL(page_cache_prev_miss); -/** +/* * find_get_entry - find and get a page cache entry * @mapping: the address_space to search * @index: The page cache index. @@ -1671,7 +1671,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t index) +static struct page *find_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct page *page; @@ -1707,39 +1707,6 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t index) return page; } -/** - * find_lock_entry - Locate and lock a page cache entry. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * Looks up the page at @mapping & @index. If there is a page in the - * cache, the head page is returned locked and with an increased refcount. - * - * If the slot holds a shadow entry of a previously evicted page, or a - * swap entry from shmem/tmpfs, it is returned. - * - * Context: May sleep. - * Return: The head page or shadow entry, %NULL if nothing is found. - */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) -{ - struct page *page; - -repeat: - page = find_get_entry(mapping, index); - if (page && !xa_is_value(page)) { - lock_page(page); - /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - VM_BUG_ON_PAGE(!thp_contains(page, index), page); - } - return page; -} - /** * pagecache_get_page - Find and get a reference to a page. * @mapping: The address_space to search. @@ -1755,6 +1722,8 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) * * %FGP_LOCK - The page is returned locked. * * %FGP_HEAD - If the page is present and a THP, return the head page * rather than the exact page specified by the index. + * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it + * instead of allocating a new page to replace it. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1779,8 +1748,11 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, repeat: page = find_get_entry(mapping, index); - if (xa_is_value(page)) + if (xa_is_value(page)) { + if (fgp_flags & FGP_ENTRY) + return page; page = NULL; + } if (!page) goto no_page; diff --git a/mm/internal.h b/mm/internal.h index 25d2b2439f19..eed74f1e6147 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -60,9 +60,6 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); } -struct page *find_get_entry(struct address_space *mapping, pgoff_t index); -struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); - /** * page_evictable - test whether a page is evictable * @page: the page to test diff --git a/mm/shmem.c b/mm/shmem.c index 5ea1fa53db3f..bd5bb78128af 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1812,7 +1812,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, sbinfo = SHMEM_SB(inode->i_sb); charge_mm = vma ? vma->vm_mm : current->mm; - page = find_lock_entry(mapping, index); + page = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0); if (xa_is_value(page)) { error = shmem_swapin_page(inode, index, &page, sgp, gfp, vma, fault_type); diff --git a/mm/swap_state.c b/mm/swap_state.c index f270c30d4681..3cdee7b11da9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -403,7 +403,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; - struct page *page = find_get_entry(mapping, index); + struct page *page = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); if (!page) return page; From bc5a301120f35caf0cd6cfdff7efa0fa779749c3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:40 -0800 Subject: [PATCH 393/506] mm/filemap: rename find_get_entry to mapping_get_entry find_get_entry doesn't "find" anything. It returns the entry at a particular index. Link: https://lkml.kernel.org/r/20201112212641.27837-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 84b7813badf1..087308cf17ba 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1659,7 +1659,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, EXPORT_SYMBOL(page_cache_prev_miss); /* - * find_get_entry - find and get a page cache entry + * mapping_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * @@ -1671,7 +1671,8 @@ EXPORT_SYMBOL(page_cache_prev_miss); * * Return: The head page or shadow entry, %NULL if nothing is found. */ -static struct page *find_get_entry(struct address_space *mapping, pgoff_t index) +static struct page *mapping_get_entry(struct address_space *mapping, + pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct page *page; @@ -1747,7 +1748,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct page *page; repeat: - page = find_get_entry(mapping, index); + page = mapping_get_entry(mapping, index); if (xa_is_value(page)) { if (fgp_flags & FGP_ENTRY) return page; From c7bad633e6b749b2d64e2421cc9d4ee0d1540a8a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:44 -0800 Subject: [PATCH 394/506] mm/filemap: add helper for finding pages There is a lot of common code in find_get_entries(), find_get_pages_range() and find_get_pages_range_tag(). Factor out find_get_entry() which simplifies all three functions. [willy@infradead.org: remove VM_BUG_ON_PAGE()] Link: https://lkml.kernel.org/r/20201124041507.28996-2-willy@infradead.orgLink: https://lkml.kernel.org/r/20201112212641.27837-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 97 +++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 55 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 087308cf17ba..21443850aeae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1825,6 +1825,42 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, } EXPORT_SYMBOL(pagecache_get_page); +static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, + xa_mark_t mark) +{ + struct page *page; + +retry: + if (mark == XA_PRESENT) + page = xas_find(xas, max); + else + page = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, page)) + goto retry; + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + return page; + + if (!page_cache_get_speculative(page)) + goto reset; + + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(xas))) { + put_page(page); + goto reset; + } + + return page; +reset: + xas_reset(xas); + goto retry; +} + /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search @@ -1864,42 +1900,21 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - xas_for_each(&xas, page, ULONG_MAX) { - if (xas_retry(&xas, page)) - continue; - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - if (xa_is_value(page)) - goto export; - - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - + while ((page = find_get_entry(&xas, ULONG_MAX, XA_PRESENT))) { /* * Terminate early on finding a THP, to allow the caller to * handle it all at once; but continue if this is hugetlbfs. */ - if (PageTransHuge(page) && !PageHuge(page)) { + if (!xa_is_value(page) && PageTransHuge(page) && + !PageHuge(page)) { page = find_subpage(page, xas.xa_index); nr_entries = ret + 1; } -export: + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1938,30 +1953,16 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - xas_for_each(&xas, page, end) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* @@ -2061,9 +2062,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - xas_for_each_marked(&xas, page, end, tag) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict @@ -2072,23 +2071,11 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - pages[ret] = page; if (++ret == nr_pages) { *index = page->index + thp_nr_pages(page); goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* From 41139aa4c3a31ee7e072fc63353c74035aade2ff Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:48 -0800 Subject: [PATCH 395/506] mm/filemap: add mapping_seek_hole_data Rewrite shmem_seek_hole_data() and move it to filemap.c. [willy@infradead.org: don't put an xa_is_value() page] Link: https://lkml.kernel.org/r/20201124041507.28996-4-willy@infradead.org Link: https://lkml.kernel.org/r/20201112212641.27837-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 76 +++++++++++++++++++++++++++++++++++++++++ mm/shmem.c | 74 +++------------------------------------ 3 files changed, 82 insertions(+), 70 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b379b2388202..3608993428d9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -760,6 +760,8 @@ extern void __delete_from_page_cache(struct page *page, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); +loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, + int whence); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/mm/filemap.c b/mm/filemap.c index 21443850aeae..eff3006be12a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,6 +2553,82 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); +static inline bool page_seek_match(struct page *page, bool seek_data) +{ + if (xa_is_value(page) || PageUptodate(page)) + return seek_data; + return !seek_data; +} + +static inline +unsigned int seek_page_size(struct xa_state *xas, struct page *page) +{ + if (xa_is_value(page)) + return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return thp_size(page); +} + +/** + * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * @mapping: Address space to search. + * @start: First byte to consider. + * @end: Limit of search (exclusive). + * @whence: Either SEEK_HOLE or SEEK_DATA. + * + * If the page cache knows which blocks contain holes and which blocks + * contain data, your filesystem can use this function to implement + * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are + * entirely memory-based such as tmpfs, and filesystems which support + * unwritten extents. + * + * Return: The requested offset on successs, or -ENXIO if @whence specifies + * SEEK_DATA and there is no data after @start. There is an implicit hole + * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start + * and @end contain data. + */ +loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, + loff_t end, int whence) +{ + XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); + pgoff_t max = (end - 1) / PAGE_SIZE; + bool seek_data = (whence == SEEK_DATA); + struct page *page; + + if (end <= start) + return -ENXIO; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + loff_t pos = xas.xa_index * PAGE_SIZE; + + if (start < pos) { + if (!seek_data) + goto unlock; + start = pos; + } + + if (page_seek_match(page, seek_data)) + goto unlock; + start = pos + seek_page_size(&xas, page); + if (!xa_is_value(page)) + put_page(page); + } + rcu_read_unlock(); + + if (seek_data) + return -ENXIO; + goto out; + +unlock: + rcu_read_unlock(); + if (!xa_is_value(page)) + put_page(page); +out: + if (start > end) + return end; + return start; +} + #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* diff --git a/mm/shmem.c b/mm/shmem.c index bd5bb78128af..deb22e128435 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2668,86 +2668,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } -/* - * llseek SEEK_DATA or SEEK_HOLE through the page cache. - */ -static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int whence) -{ - struct page *page; - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - bool done = false; - int i; - - pagevec_init(&pvec); - pvec.nr = 1; /* start small: we may be there already */ - while (!done) { - pvec.nr = find_get_entries(mapping, index, - pvec.nr, pvec.pages, indices); - if (!pvec.nr) { - if (whence == SEEK_DATA) - index = end; - break; - } - for (i = 0; i < pvec.nr; i++, index++) { - if (index < indices[i]) { - if (whence == SEEK_HOLE) { - done = true; - break; - } - index = indices[i]; - } - page = pvec.pages[i]; - if (page && !xa_is_value(page)) { - if (!PageUptodate(page)) - page = NULL; - } - if (index >= end || - (page && whence == SEEK_DATA) || - (!page && whence == SEEK_HOLE)) { - done = true; - break; - } - } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - pvec.nr = PAGEVEC_SIZE; - cond_resched(); - } - return index; -} - static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t start, end; - loff_t new_offset; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); + if (offset < 0) + return -ENXIO; + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ - - if (offset < 0 || offset >= inode->i_size) - offset = -ENXIO; - else { - start = offset >> PAGE_SHIFT; - end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, whence); - new_offset <<= PAGE_SHIFT; - if (new_offset > offset) { - if (new_offset < inode->i_size) - offset = new_offset; - else if (whence == SEEK_DATA) - offset = -ENXIO; - else - offset = inode->i_size; - } - } - + offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); From 54fa39ac2e00b1b8c2a7fe72e648773ffa48f76d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:52 -0800 Subject: [PATCH 396/506] iomap: use mapping_seek_hole_data Enhance mapping_seek_hole_data() to handle partially uptodate pages and convert the iomap seek code to call it. Link: https://lkml.kernel.org/r/20201112212641.27837-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/iomap/seek.c | 125 +++++------------------------------------------- mm/filemap.c | 37 ++++++++++++-- 2 files changed, 43 insertions(+), 119 deletions(-) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 107ee80c3568..dab1b02eba5b 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,122 +10,17 @@ #include #include -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. - * Returns true if found and updates @lastoff to the offset in file. - */ -static bool -page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, - int whence) -{ - const struct address_space_operations *ops = inode->i_mapping->a_ops; - unsigned int bsize = i_blocksize(inode), off; - bool seek_data = whence == SEEK_DATA; - loff_t poff = page_offset(page); - - if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) - return false; - - if (*lastoff < poff) { - /* - * Last offset smaller than the start of the page means we found - * a hole: - */ - if (whence == SEEK_HOLE) - return true; - *lastoff = poff; - } - - /* - * Just check the page unless we can and should check block ranges: - */ - if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) - return PageUptodate(page) == seek_data; - - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) - goto out_unlock_not_found; - - for (off = 0; off < PAGE_SIZE; off += bsize) { - if (offset_in_page(*lastoff) >= off + bsize) - continue; - if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { - unlock_page(page); - return true; - } - *lastoff = poff + off + bsize; - } - -out_unlock_not_found: - unlock_page(page); - return false; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: uptodate buffer heads count as data; everything else - * counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. - */ static loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (page_seek_hole_data(inode, page, &lastoff, whence)) - goto check_range; - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. */ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - - -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_HOLE); - if (offset < 0) + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_HOLE); + if (offset == start + length) return length; fallthrough; case IOMAP_HOLE: @@ -164,15 +59,17 @@ iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_seek_hole); static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_HOLE: return length; case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_DATA); + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_DATA); if (offset < 0) return length; fallthrough; diff --git a/mm/filemap.c b/mm/filemap.c index eff3006be12a..6a34f94adf3b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,11 +2553,36 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); -static inline bool page_seek_match(struct page *page, bool seek_data) +static inline loff_t page_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct page *page, + loff_t start, loff_t end, bool seek_data) { + const struct address_space_operations *ops = mapping->a_ops; + size_t offset, bsz = i_blocksize(mapping->host); + if (xa_is_value(page) || PageUptodate(page)) - return seek_data; - return !seek_data; + return seek_data ? start : end; + if (!ops->is_partially_uptodate) + return seek_data ? end : start; + + xas_pause(xas); + rcu_read_unlock(); + lock_page(page); + if (unlikely(page->mapping != mapping)) + goto unlock; + + offset = offset_in_thp(page, start) & ~(bsz - 1); + + do { + if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + break; + start = (start + bsz) & ~(bsz - 1); + offset += bsz; + } while (offset < thp_size(page)); +unlock: + unlock_page(page); + rcu_read_lock(); + return start; } static inline @@ -2607,9 +2632,11 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - if (page_seek_match(page, seek_data)) + pos += seek_page_size(&xas, page); + start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_data); + if (start < pos) goto unlock; - start = pos + seek_page_size(&xas, page); if (!xa_is_value(page)) put_page(page); } From 5c211ba29deb84e647b3a87207c8714efd9c11d5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:15:56 -0800 Subject: [PATCH 397/506] mm: add and use find_lock_entries We have three functions (shmem_undo_range(), truncate_inode_pages_range() and invalidate_mapping_pages()) which want exactly this function, so add it to filemap.c. Before this patch, shmem_undo_range() would split any compound page which overlaps either end of the range being punched in both the first and second loops through the address space. After this patch, that functionality is left for the second loop, which is arguably more appropriate since the first loop is supposed to run through all the pages quickly, and splitting a page can sleep. [willy@infradead.org: add assertion] Link: https://lkml.kernel.org/r/20201124041507.28996-3-willy@infradead.org Link: https://lkml.kernel.org/r/20201112212641.27837-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 59 +++++++++++++++++++++++++++++++++ mm/internal.h | 3 ++ mm/shmem.c | 22 +++---------- mm/truncate.c | 91 +++++++-------------------------------------------- 4 files changed, 78 insertions(+), 97 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 6a34f94adf3b..61fdcdc75275 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1920,6 +1920,65 @@ unsigned find_get_entries(struct address_space *mapping, return ret; } +/** + * find_lock_entries - Find a batch of pagecache entries. + * @mapping: The address_space to search. + * @start: The starting page cache index. + * @end: The final page index (inclusive). + * @pvec: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @pvec. + * + * find_lock_entries() will return a batch of entries from @mapping. + * Swap, shadow and DAX entries are included. Pages are returned + * locked and with an incremented refcount. Pages which are locked by + * somebody else or under writeback are skipped. Only the head page of + * a THP is returned. Pages which are partially outside the range are + * not returned. + * + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries, THP pages, pages which could not be locked + * or pages under writeback. + * + * Return: The number of entries which were found. + */ +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(page)) { + if (page->index < start) + goto put; + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); + if (page->index + thp_nr_pages(page) - 1 > end) + goto put; + if (!trylock_page(page)) + goto put; + if (page->mapping != mapping || PageWriteback(page)) + goto unlock; + VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), + page); + } + indices[pvec->nr] = xas.xa_index; + if (!pagevec_add(pvec, page)) + break; + goto next; +unlock: + unlock_page(page); +put: + put_page(page); +next: + if (!xa_is_value(page) && PageTransHuge(page)) + xas_set(&xas, page->index + thp_nr_pages(page)); + } + rcu_read_unlock(); + + return pagevec_count(pvec); +} + /** * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search diff --git a/mm/internal.h b/mm/internal.h index eed74f1e6147..9902648f2206 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -60,6 +60,9 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); } +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices); + /** * page_evictable - test whether a page is evictable * @page: the page to test diff --git a/mm/shmem.c b/mm/shmem.c index deb22e128435..86b1f5bc502c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -907,12 +907,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec); index = start; - while (index < end) { - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) - break; + while (index < end && find_lock_entries(mapping, index, end - 1, + &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -927,18 +923,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index, page); continue; } + index += thp_nr_pages(page) - 1; - VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); - - if (!trylock_page(page)) - continue; - - if ((!unfalloc || !PageUptodate(page)) && - page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - } + if (!unfalloc || !PageUptodate(page)) + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_remove_exceptionals(&pvec); diff --git a/mm/truncate.c b/mm/truncate.c index 8aa4907e06e0..de7f4f47f780 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -326,51 +326,19 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - /* - * Pagevec array has exceptional entries and we may also fail - * to lock some pages. So we store pages that can be deleted - * in a new pagevec. - */ - struct pagevec locked_pvec; - - pagevec_init(&locked_pvec); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - - /* We rely upon deletion not changing page->index */ - index = indices[i]; - if (index >= end) - break; - - if (xa_is_value(page)) - continue; - - if (!trylock_page(page)) - continue; - WARN_ON(page_to_index(page) != index); - if (PageWriteback(page)) { - unlock_page(page); - continue; - } - if (page->mapping != mapping) { - unlock_page(page); - continue; - } - pagevec_add(&locked_pvec, page); - } - for (i = 0; i < pagevec_count(&locked_pvec); i++) - truncate_cleanup_page(mapping, locked_pvec.pages[i]); - delete_from_page_cache_batch(mapping, &locked_pvec); - for (i = 0; i < pagevec_count(&locked_pvec); i++) - unlock_page(locked_pvec.pages[i]); + while (index < end && find_lock_entries(mapping, index, end - 1, + &pvec, indices)) { + index = indices[pagevec_count(&pvec) - 1] + 1; truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); + for (i = 0; i < pagevec_count(&pvec); i++) + truncate_cleanup_page(mapping, pvec.pages[i]); + delete_from_page_cache_batch(mapping, &pvec); + for (i = 0; i < pagevec_count(&pvec); i++) + unlock_page(pvec.pages[i]); pagevec_release(&pvec); cond_resched(); - index++; } + if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { @@ -539,9 +507,7 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, int i; pagevec_init(&pvec); - while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - indices)) { + while (find_lock_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -555,39 +521,7 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, page); continue; } - - if (!trylock_page(page)) - continue; - - WARN_ON(page_to_index(page) != index); - - /* Middle of THP: skip */ - if (PageTransTail(page)) { - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - /* - * 'end' is in the middle of THP. Don't - * invalidate the page as the part outside of - * 'end' could be still useful. - */ - if (index > end) { - unlock_page(page); - continue; - } - - /* Take a pin outside pagevec */ - get_page(page); - - /* - * Drop extra pins before trying to invalidate - * the huge page. - */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - } + index += thp_nr_pages(page) - 1; ret = invalidate_inode_page(page); unlock_page(page); @@ -601,9 +535,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, if (nr_pagevec) (*nr_pagevec)++; } - - if (PageTransHuge(page)) - put_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); From ca122fe40eb463c8c11c3bfc1914f0048ca5c268 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:00 -0800 Subject: [PATCH 398/506] mm: add an 'end' parameter to find_get_entries This simplifies the callers and leads to a more efficient implementation since the XArray has this functionality already. Link: https://lkml.kernel.org/r/20201112212641.27837-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++-- mm/filemap.c | 9 +++++---- mm/shmem.c | 10 ++-------- mm/swap.c | 2 +- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3608993428d9..fdb2c4e44851 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -451,8 +451,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) } unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - unsigned int nr_entries, struct page **entries, - pgoff_t *indices); + pgoff_t end, unsigned int nr_entries, struct page **entries, + pgoff_t *indices); unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); diff --git a/mm/filemap.c b/mm/filemap.c index 61fdcdc75275..65cfdff17ac6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1865,6 +1865,7 @@ static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index + * @end: The final page index (inclusive). * @nr_entries: The maximum number of entries * @entries: Where the resulting entries are placed * @indices: The cache indices corresponding to the entries in @entries @@ -1888,9 +1889,9 @@ static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, * * Return: the number of pages and shadow entries which were found. */ -unsigned find_get_entries(struct address_space *mapping, - pgoff_t start, unsigned int nr_entries, - struct page **entries, pgoff_t *indices) +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, unsigned int nr_entries, struct page **entries, + pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; @@ -1900,7 +1901,7 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, ULONG_MAX, XA_PRESENT))) { + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* * Terminate early on finding a THP, to allow the caller to * handle it all at once; but continue if this is hugetlbfs. diff --git a/mm/shmem.c b/mm/shmem.c index 86b1f5bc502c..4aac760aa2d4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -913,8 +913,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct page *page = pvec.pages[i]; index = indices[i]; - if (index >= end) - break; if (xa_is_value(page)) { if (unfalloc) @@ -967,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, while (index < end) { cond_resched(); - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, end - 1, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -982,9 +979,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct page *page = pvec.pages[i]; index = indices[i]; - if (index >= end) - break; - if (xa_is_value(page)) { if (unfalloc) continue; diff --git a/mm/swap.c b/mm/swap.c index ab3258afcbeb..c5773a84feab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1046,7 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, pgoff_t start, unsigned nr_entries, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, nr_entries, + pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries, pvec->pages, indices); return pagevec_count(pvec); } From 31d270fd98d196578223e5b568a0bd3bc6028b09 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:03 -0800 Subject: [PATCH 399/506] mm: add an 'end' parameter to pagevec_lookup_entries Simplifies the callers and uses the existing functionality in find_get_entries(). We can also drop the final argument of truncate_exceptional_pvec_entries() and simplify the logic in that function. Link: https://lkml.kernel.org/r/20201112212641.27837-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 5 ++--- mm/swap.c | 8 ++++---- mm/truncate.c | 41 ++++++++++------------------------------- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index ad4ddc17d403..f70a9dc81504 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,9 +26,8 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, - pgoff_t start, unsigned nr_entries, - pgoff_t *indices); + struct address_space *mapping, pgoff_t start, pgoff_t end, + unsigned nr_entries, pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, diff --git a/mm/swap.c b/mm/swap.c index c5773a84feab..db8c354264a5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1022,6 +1022,7 @@ void __pagevec_lru_add(struct pagevec *pvec) * @pvec: Where the resulting entries are placed * @mapping: The address_space to search * @start: The starting entry index + * @end: The highest index to return (inclusive). * @nr_entries: The maximum number of pages * @indices: The cache indices corresponding to the entries in @pvec * @@ -1042,11 +1043,10 @@ void __pagevec_lru_add(struct pagevec *pvec) * found. */ unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, - pgoff_t start, unsigned nr_entries, - pgoff_t *indices) + struct address_space *mapping, pgoff_t start, pgoff_t end, + unsigned nr_entries, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries, + pvec->nr = find_get_entries(mapping, start, end, nr_entries, pvec->pages, indices); return pagevec_count(pvec); } diff --git a/mm/truncate.c b/mm/truncate.c index de7f4f47f780..60df23890c2d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -57,11 +57,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, * exceptional entries similar to what pagevec_remove_exceptionals does. */ static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices, - pgoff_t end) + struct pagevec *pvec, pgoff_t *indices) { int i, j; - bool dax, lock; + bool dax; /* Handled by shmem itself */ if (shmem_mapping(mapping)) @@ -75,8 +74,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; dax = dax_mapping(mapping); - lock = !dax && indices[j] < end; - if (lock) + if (!dax) xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { @@ -88,9 +86,6 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, continue; } - if (index >= end) - continue; - if (unlikely(dax)) { dax_delete_mapping_entry(mapping, index); continue; @@ -99,7 +94,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, __clear_shadow_entry(mapping, index, page); } - if (lock) + if (!dax) xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -329,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping, while (index < end && find_lock_entries(mapping, index, end - 1, &pvec, indices)) { index = indices[pagevec_count(&pvec) - 1] + 1; - truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); + truncate_exceptional_pvec_entries(mapping, &pvec, indices); for (i = 0; i < pagevec_count(&pvec); i++) truncate_cleanup_page(mapping, pvec.pages[i]); delete_from_page_cache_batch(mapping, &pvec); @@ -381,8 +376,8 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1, + PAGEVEC_SIZE, indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; @@ -390,23 +385,12 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; continue; } - if (index == start && indices[0] >= end) { - /* All gone out of hole to be punched, we're done */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - break; - } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index >= end) { - /* Restart punch to make sure all gone */ - index = start - 1; - break; - } if (xa_is_value(page)) continue; @@ -417,7 +401,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } - truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); + truncate_exceptional_pvec_entries(mapping, &pvec, indices); pagevec_release(&pvec); index++; } @@ -513,8 +497,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index > end) - break; if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, @@ -656,16 +638,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - indices)) { + while (pagevec_lookup_entries(&pvec, mapping, index, end, + PAGEVEC_SIZE, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index > end) - break; if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, From 38cefeb33749992ceaad6ea40e12f92aa8f8e28f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:07 -0800 Subject: [PATCH 400/506] mm: remove nr_entries parameter from pagevec_lookup_entries All callers want to fetch the full size of the pvec. Link: https://lkml.kernel.org/r/20201112212641.27837-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 2 +- mm/swap.c | 4 ++-- mm/truncate.c | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index f70a9dc81504..72c5ea2e708d 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,7 +27,7 @@ void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, pgoff_t end, - unsigned nr_entries, pgoff_t *indices); + pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, diff --git a/mm/swap.c b/mm/swap.c index db8c354264a5..cd9e1ed7e78f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1044,9 +1044,9 @@ void __pagevec_lru_add(struct pagevec *pvec) */ unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, pgoff_t end, - unsigned nr_entries, pgoff_t *indices) + pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, end, nr_entries, + pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE, pvec->pages, indices); return pagevec_count(pvec); } diff --git a/mm/truncate.c b/mm/truncate.c index 60df23890c2d..41e7377ad58d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -377,7 +377,7 @@ void truncate_inode_pages_range(struct address_space *mapping, for ( ; ; ) { cond_resched(); if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1, - PAGEVEC_SIZE, indices)) { + indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; @@ -638,8 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (pagevec_lookup_entries(&pvec, mapping, index, end, - PAGEVEC_SIZE, indices)) { + while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; From cf2039af1a2eee58fdbfa68bc0c9123e77477645 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:11 -0800 Subject: [PATCH 401/506] mm: pass pvec directly to find_get_entries All callers of find_get_entries() use a pvec, so pass it directly instead of manipulating it in the caller. Link: https://lkml.kernel.org/r/20201112212641.27837-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Cc: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- mm/filemap.c | 21 +++++++++------------ mm/shmem.c | 5 ++--- mm/swap.c | 4 +--- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fdb2c4e44851..20225b067583 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -451,8 +451,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) } unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, unsigned int nr_entries, struct page **entries, - pgoff_t *indices); + pgoff_t end, struct pagevec *pvec, pgoff_t *indices); unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); diff --git a/mm/filemap.c b/mm/filemap.c index 65cfdff17ac6..43700480d897 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1866,14 +1866,12 @@ static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). - * @nr_entries: The maximum number of entries - * @entries: Where the resulting entries are placed + * @pvec: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * - * find_get_entries() will search for and return a group of up to - * @nr_entries entries in the mapping. The entries are placed at - * @entries. find_get_entries() takes a reference against any actual - * pages it returns. + * find_get_entries() will search for and return a batch of entries in + * the mapping. The entries are placed in @pvec. find_get_entries() + * takes a reference on any actual pages it returns. * * The search returns a group of mapping-contiguous page cache entries * with ascending indexes. There may be holes in the indices due to @@ -1890,15 +1888,12 @@ static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, * Return: the number of pages and shadow entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, unsigned int nr_entries, struct page **entries, - pgoff_t *indices) + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned int ret = 0; - - if (!nr_entries) - return 0; + unsigned nr_entries = PAGEVEC_SIZE; rcu_read_lock(); while ((page = find_get_entry(&xas, end, XA_PRESENT))) { @@ -1913,11 +1908,13 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, } indices[ret] = xas.xa_index; - entries[ret] = page; + pvec->pages[ret] = page; if (++ret == nr_entries) break; } rcu_read_unlock(); + + pvec->nr = ret; return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 4aac760aa2d4..ee8f21832f98 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -965,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, while (index < end) { cond_resched(); - pvec.nr = find_get_entries(mapping, index, end - 1, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) { + if (!find_get_entries(mapping, index, end - 1, &pvec, + indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; diff --git a/mm/swap.c b/mm/swap.c index cd9e1ed7e78f..d20a746a831e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1046,9 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, pgoff_t end, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE, - pvec->pages, indices); - return pagevec_count(pvec); + return find_get_entries(mapping, start, end, pvec, indices); } /** From a656a20241f08be532539c7d5bd82df741c2d487 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 17:16:14 -0800 Subject: [PATCH 402/506] mm: remove pagevec_lookup_entries pagevec_lookup_entries() is now just a wrapper around find_get_entries() so remove it and convert all its callers. Link: https://lkml.kernel.org/r/20201112212641.27837-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 --- mm/swap.c | 36 ++---------------------------------- mm/truncate.c | 4 ++-- 3 files changed, 4 insertions(+), 39 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 72c5ea2e708d..7f3f19065a9f 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -25,9 +25,6 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); -unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, pgoff_t start, pgoff_t end, - pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, diff --git a/mm/swap.c b/mm/swap.c index d20a746a831e..31b844d4ed94 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1017,44 +1017,12 @@ void __pagevec_lru_add(struct pagevec *pvec) pagevec_reinit(pvec); } -/** - * pagevec_lookup_entries - gang pagecache lookup - * @pvec: Where the resulting entries are placed - * @mapping: The address_space to search - * @start: The starting entry index - * @end: The highest index to return (inclusive). - * @nr_entries: The maximum number of pages - * @indices: The cache indices corresponding to the entries in @pvec - * - * pagevec_lookup_entries() will search for and return a group of up - * to @nr_pages pages and shadow entries in the mapping. All - * entries are placed in @pvec. pagevec_lookup_entries() takes a - * reference against actual pages in @pvec. - * - * The search returns a group of mapping-contiguous entries with - * ascending indexes. There may be holes in the indices due to - * not-present entries. - * - * Only one subpage of a Transparent Huge Page is returned in one call: - * allowing truncate_inode_pages_range() to evict the whole THP without - * cycling through a pagevec of extra references. - * - * pagevec_lookup_entries() returns the number of entries which were - * found. - */ -unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, pgoff_t start, pgoff_t end, - pgoff_t *indices) -{ - return find_get_entries(mapping, start, end, pvec, indices); -} - /** * pagevec_remove_exceptionals - pagevec exceptionals pruning * @pvec: The pagevec to prune * - * pagevec_lookup_entries() fills both pages and exceptional radix - * tree entries into the pagevec. This function prunes all + * find_get_entries() fills both pages and XArray value entries (aka + * exceptional entries) into the pagevec. This function prunes all * exceptionals from @pvec without leaving holes, so that it can be * passed on to page-only pagevec operations. */ diff --git a/mm/truncate.c b/mm/truncate.c index 41e7377ad58d..455944264663 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -376,7 +376,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1, + if (!find_get_entries(mapping, index, end - 1, &pvec, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -638,7 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) { + while (find_get_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; From 164cc4fef4456727466f8e35bb654c3994748070 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:18 -0800 Subject: [PATCH 403/506] mm,thp,shmem: limit shmem THP alloc gfp_mask Patch series "mm,thp,shm: limit shmem THP alloc gfp_mask", v6. The allocation flags of anonymous transparent huge pages can be controlled through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can help the system from getting bogged down in the page reclaim and compaction code when many THPs are getting allocated simultaneously. However, the gfp_mask for shmem THP allocations were not limited by those configuration settings, and some workloads ended up with all CPUs stuck on the LRU lock in the page reclaim code, trying to allocate dozens of THPs simultaneously. This patch applies the same configurated limitation of THPs to shmem hugepage allocations, to prevent that from happening. This way a THP defrag setting of "never" or "defer+madvise" will result in quick allocation failures without direct reclaim when no 2MB free pages are available. With this patch applied, THP allocations for tmpfs will be a little more aggressive than today for files mmapped with MADV_HUGEPAGE, and a little less aggressive for files that are not mmapped or mapped without that flag. This patch (of 4): The allocation flags of anonymous transparent huge pages can be controlled through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can help the system from getting bogged down in the page reclaim and compaction code when many THPs are getting allocated simultaneously. However, the gfp_mask for shmem THP allocations were not limited by those configuration settings, and some workloads ended up with all CPUs stuck on the LRU lock in the page reclaim code, trying to allocate dozens of THPs simultaneously. This patch applies the same configurated limitation of THPs to shmem hugepage allocations, to prevent that from happening. Controlling the gfp_mask of THP allocations through the knobs in sysfs allows users to determine the balance between how aggressively the system tries to allocate THPs at fault time, and how much the application may end up stalling attempting those allocations. This way a THP defrag setting of "never" or "defer+madvise" will result in quick allocation failures without direct reclaim when no 2MB free pages are available. With this patch applied, THP allocations for tmpfs will be a little more aggressive than today for files mmapped with MADV_HUGEPAGE, and a little less aggressive for files that are not mmapped or mapped without that flag. Link: https://lkml.kernel.org/r/20201124194925.623931-1-riel@surriel.com Link: https://lkml.kernel.org/r/20201124194925.623931-2-riel@surriel.com Signed-off-by: Rik van Riel Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Xu Yu Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ mm/huge_memory.c | 6 +++--- mm/shmem.c | 8 +++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 220cd553a9e7..8572a1474e16 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -634,6 +634,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); +extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); + #ifdef CONFIG_PM_SLEEP extern bool pm_suspended_storage(void); #else diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d77605c30f2e..395c75111d33 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -668,9 +668,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { - const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) @@ -762,7 +762,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); + gfp = vma_thp_gfp_mask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); diff --git a/mm/shmem.c b/mm/shmem.c index ee8f21832f98..596009a44431 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1519,8 +1519,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), + true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1776,6 +1776,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; + gfp_t huge_gfp; int error; int once = 0; int alloced = 0; @@ -1862,7 +1863,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); + huge_gfp = vma_thp_gfp_mask(vma); + page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); if (IS_ERR(page)) { alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, From 78cc8cdc54008f54b79711fc027afc3564588a04 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:22 -0800 Subject: [PATCH 404/506] mm,thp,shm: limit gfp mask to no more than specified Matthew Wilcox pointed out that the i915 driver opportunistically allocates tmpfs memory, but will happily reclaim some of its pool if no memory is available. Make sure the gfp mask used to opportunistically allocate a THP is always at least as restrictive as the original gfp mask. Link: https://lkml.kernel.org/r/20201124194925.623931-3-riel@surriel.com Signed-off-by: Rik van Riel Suggested-by: Matthew Wilcox Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xu Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 596009a44431..06c771d23127 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1505,6 +1505,26 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, return page; } +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. + */ +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t result = huge_gfp & ~allowflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; +} + static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { @@ -1864,6 +1884,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, alloc_huge: huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); if (IS_ERR(page)) { alloc_nohuge: From cd89fb06509903f942a0ffe97ffa63034671ed0c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:25 -0800 Subject: [PATCH 405/506] mm,thp,shmem: make khugepaged obey tmpfs mount flags Currently if thp enabled=[madvise], mounting a tmpfs filesystem with huge=always and mmapping files from that tmpfs does not result in khugepaged collapsing those mappings, despite the mount flag indicating that it should. Fix that by breaking up the blocks of tests in hugepage_vma_check a little bit, and testing things in the correct order. Link: https://lkml.kernel.org/r/20201124194925.623931-4-riel@surriel.com Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when merging vma for shmem") Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xu Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 2 ++ mm/khugepaged.c | 22 ++++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index c941b7377321..2fcc01891b47 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -3,6 +3,7 @@ #define _LINUX_KHUGEPAGED_H #include /* MMF_VM_HUGEPAGE */ +#include #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -57,6 +58,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || + (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && !(vm_flags & VM_NOHUGEPAGE) && !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 75e246f680f4..a7d6cb912b05 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -442,18 +442,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) static bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags) { - if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vm_flags & VM_NOHUGEPAGE) || + /* Explicitly disabled through madvise. */ + if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; - if (shmem_file(vma->vm_file) || - (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - vma->vm_file && - (vm_flags & VM_DENYWRITE))) { + /* Enabled via shmem mount options or sysfs settings. */ + if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) { return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + return false; + + /* Read-only file mappings need to be aligned for THP to work. */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + (vm_flags & VM_DENYWRITE)) { + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) return false; if (vma_is_temporary_stack(vma)) From 187df5dde943ae28f260db7377467ffb3b51a6de Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Feb 2021 17:16:29 -0800 Subject: [PATCH 406/506] mm,shmem,thp: limit shmem THP allocations to requested zones Hugh pointed out that the gma500 driver uses shmem pages, but needs to limit them to the DMA32 zone. Ensure the allocations resulting from the gfp_mask returned by limit_gfp_mask use the zone flags that were originally passed to shmem_getpage_gfp. Link: https://lkml.kernel.org/r/20210224121016.1314ed6d@imladris.surriel.com Signed-off-by: Rik van Riel Suggested-by: Hugh Dickins Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xu Yu Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 06c771d23127..b2db4ed0fbc7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1513,7 +1513,11 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; - gfp_t result = huge_gfp & ~allowflags; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, From df2ff39e78da74dc23e7187dd58a784d91a876e0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 25 Feb 2021 17:16:33 -0800 Subject: [PATCH 407/506] mm: cma: allocate cma areas bottom-up Currently cma areas without a fixed base are allocated close to the end of the node. This placement is sub-optimal because of compaction: it brings pages into the cma area. In particular, it can bring in hot executable pages, even if there is a plenty of free memory on the machine. This results in cma allocation failures. Instead let's place cma areas close to the beginning of a node. In this case the compaction will help to free cma areas, resulting in better cma allocation success rates. If there is enough memory let's try to allocate bottom-up starting with 4GB to exclude any possible interference with DMA32. On smaller machines or in a case of a failure, stick with the old behavior. 16GB vm, 2GB cma area: With this patch: [ 0.000000] Command line: root=/dev/vda3 rootflags=subvol=/root systemd.unified_cgroup_hierarchy=1 enforcing=0 console=ttyS0,115200 hugetlb_cma=2G [ 0.002928] hugetlb_cma: reserve 2048 MiB, up to 2048 MiB per node [ 0.002930] cma: Reserved 2048 MiB at 0x0000000100000000 [ 0.002931] hugetlb_cma: reserved 2048 MiB on node 0 Without this patch: [ 0.000000] Command line: root=/dev/vda3 rootflags=subvol=/root systemd.unified_cgroup_hierarchy=1 enforcing=0 console=ttyS0,115200 hugetlb_cma=2G [ 0.002930] hugetlb_cma: reserve 2048 MiB, up to 2048 MiB per node [ 0.002933] cma: Reserved 2048 MiB at 0x00000003c0000000 [ 0.002934] hugetlb_cma: reserved 2048 MiB on node 0 v2: - switched to memblock_set_bottom_up(true), by Mike - start with 4GB, by Mike [guro@fb.com: whitespace fix, per Mike] Link: https://lkml.kernel.org/r/20201221170551.GB3428478@carbon.DHCP.thefacebook.com [guro@fb.com: fix 32-bit warnings] Link: https://lkml.kernel.org/r/20201223163537.GA4011967@carbon.DHCP.thefacebook.com [guro@fb.com: fix 32-bit systems] [akpm@linux-foundation.org: build fix] Link: https://lkml.kernel.org/r/20201217201214.3414100-1-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Mike Rapoport Cc: Wonhyuk Yang Cc: Joonsoo Kim Cc: Rik van Riel Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/cma.c b/mm/cma.c index 20c4f6f40037..0ba69cd16aeb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -336,6 +336,23 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, limit = highmem_start; } + /* + * If there is enough memory, try a bottom-up allocation first. + * It will place the new cma area close to the start of the node + * and guarantee that the compaction is moving pages out of the + * cma area and not into it. + * Avoid using first 4GB to not interfere with constrained zones + * like DMA/DMA32. + */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) { + memblock_set_bottom_up(true); + addr = memblock_alloc_range_nid(size, alignment, SZ_4G, + limit, nid, true); + memblock_set_bottom_up(false); + } +#endif + if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); From 072355c1cf2d4f37993bcfc5894e17d0b11bb290 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:16:37 -0800 Subject: [PATCH 408/506] mm/cma: expose all pages to the buddy if activation of an area fails Right now, if activation fails, we might already have exposed some pages to the buddy for CMA use (although they will never get actually used by CMA), and some pages won't be exposed to the buddy at all. Let's check for "single zone" early and on error, don't expose any pages for CMA use - instead, expose them to the buddy available for any use. Simply call free_reserved_page() on every single page - easier than going via free_reserved_area(), converting back and forth between pfns and virt addresses. In addition, make sure to fixup totalcma_pages properly. Example: 6 GiB QEMU VM with "... hugetlb_cma=2G movablecore=20% ...": [ 0.006891] hugetlb_cma: reserve 2048 MiB, up to 2048 MiB per node [ 0.006893] cma: Reserved 2048 MiB at 0x0000000100000000 [ 0.006893] hugetlb_cma: reserved 2048 MiB on node 0 ... [ 0.175433] cma: CMA area hugetlb0 could not be activated Before this patch: # cat /proc/meminfo MemTotal: 5867348 kB MemFree: 5692808 kB MemAvailable: 5542516 kB ... CmaTotal: 2097152 kB CmaFree: 1884160 kB After this patch: # cat /proc/meminfo MemTotal: 6077308 kB MemFree: 5904208 kB MemAvailable: 5747968 kB ... CmaTotal: 0 kB CmaFree: 0 kB Note: cma_init_reserved_mem() makes sure that we always cover full pageblocks / MAX_ORDER - 1 pages. Link: https://lkml.kernel.org/r/20210127101813.6370-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: Mike Rapoport Cc: Michal Hocko Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 0ba69cd16aeb..23d4a97c834a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -94,34 +94,29 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, static void __init cma_activate_area(struct cma *cma) { - unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; - unsigned i = cma->count >> pageblock_order; + unsigned long base_pfn = cma->base_pfn, pfn; struct zone *zone; cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); if (!cma->bitmap) goto out_error; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); + /* + * alloc_contig_range() requires the pfn range specified to be in the + * same zone. Simplify by forcing the entire CMA resv range to be in the + * same zone. + */ + WARN_ON_ONCE(!pfn_valid(base_pfn)); + zone = page_zone(pfn_to_page(base_pfn)); + for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + if (page_zone(pfn_to_page(pfn)) != zone) + goto not_in_zone; + } - do { - unsigned j; - - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); + for (pfn = base_pfn; pfn < base_pfn + cma->count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); mutex_init(&cma->lock); @@ -135,6 +130,10 @@ static void __init cma_activate_area(struct cma *cma) not_in_zone: bitmap_free(cma->bitmap); out_error: + /* Expose all pages to the buddy, they are useless for CMA. */ + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); return; From 3c381db1fac80373f2cc0d8c1d0bcfbf8bd4fb57 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:16:40 -0800 Subject: [PATCH 409/506] mm/page_alloc: count CMA pages per zone and print them in /proc/zoneinfo Let's count the number of CMA pages per zone and print them in /proc/zoneinfo. Having access to the total number of CMA pages per zone is helpful for debugging purposes to know where exactly the CMA pages ended up, and to figure out how many pages of a zone might behave differently, even after some of these pages might already have been allocated. As one example, CMA pages part of a kernel zone cannot be used for ordinary kernel allocations but instead behave more like ZONE_MOVABLE. For now, we are only able to get the global nr+free cma pages from /proc/meminfo and the free cma pages per zone from /proc/zoneinfo. Example after this patch when booting a 6 GiB QEMU VM with "hugetlb_cma=2G": # cat /proc/zoneinfo | grep cma cma 0 nr_free_cma 0 cma 0 nr_free_cma 0 cma 524288 nr_free_cma 493016 cma 0 cma 0 # cat /proc/meminfo | grep Cma CmaTotal: 2097152 kB CmaFree: 1972064 kB Note: We print even without CONFIG_CMA, just like "nr_free_cma"; this way, one can be sure when spotting "cma 0", that there are definetly no CMA pages located in a zone. [david@redhat.com: v2] Link: https://lkml.kernel.org/r/20210128164533.18566-1-david@redhat.com [david@redhat.com: v3] Link: https://lkml.kernel.org/r/20210129113451.22085-1-david@redhat.com Link: https://lkml.kernel.org/r/20210127101813.6370-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: David Rientjes Cc: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: Mike Rapoport Cc: Michal Hocko Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 15 +++++++++++++++ mm/page_alloc.c | 1 + mm/vmstat.c | 6 ++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9198b7ade85f..5f9c4dad73ed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -503,6 +503,9 @@ struct zone { * bootmem allocator): * managed_pages = present_pages - reserved_pages; * + * cma pages is present pages that are assigned for CMA use + * (MIGRATE_CMA). + * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used @@ -527,6 +530,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#ifdef CONFIG_CMA + unsigned long cma_pages; +#endif const char *name; @@ -624,6 +630,15 @@ static inline unsigned long zone_managed_pages(struct zone *zone) return (unsigned long)atomic_long_read(&zone->managed_pages); } +static inline unsigned long zone_cma_pages(struct zone *zone) +{ +#ifdef CONFIG_CMA + return zone->cma_pages; +#else + return 0; +#endif +} + static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ddccc59f2f72..3e4b29ee2b1e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2168,6 +2168,7 @@ void __init init_cma_reserved_pageblock(struct page *page) } adjust_managed_page_count(page, pageblock_nr_pages); + page_zone(page)->cma_pages += pageblock_nr_pages; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index a0e949542204..6cdf789ced5e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1637,14 +1637,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n high %lu" "\n spanned %lu" "\n present %lu" - "\n managed %lu", + "\n managed %lu" + "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone_managed_pages(zone)); + zone_managed_pages(zone), + zone_cma_pages(zone)); seq_printf(m, "\n protection: (%ld", From a052d4d13d88c2073d1339d9dce02cba7b4dc609 Mon Sep 17 00:00:00 2001 From: Patrick Daly Date: Thu, 25 Feb 2021 17:16:44 -0800 Subject: [PATCH 410/506] mm: cma: print region name on failure Print the name of the CMA region for convenience. This is useful information to have when cma_alloc() fails. [pdaly@codeaurora.org: print the "count" variable] Link: https://lkml.kernel.org/r/20210209142414.12768-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210208115200.20286-1-georgi.djakov@linaro.org Signed-off-by: Patrick Daly Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Reviewed-by: David Hildenbrand Reviewed-by: Randy Dunlap Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 23d4a97c834a..54eee2119822 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -500,8 +500,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", - __func__, count, ret); + pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + __func__, cma->name, count, ret); cma_debug_show_areas(cma); } From 2bbd00aef0671bfe3c2ca5ba67097246257de125 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Feb 2021 17:16:47 -0800 Subject: [PATCH 411/506] mm: vmstat: fix NOHZ wakeups for node stat changes On NOHZ, the periodic vmstat flushers on each CPU can go to sleep and won't wake up until stat changes are detected in the per-cpu deltas of the zone vmstat counters. In commit 75ef71840539 ("mm, vmstat: add infrastructure for per-node vmstats") per-node counters were introduced, and subsequently most stats were moved from the zone to the node level. However, the node counters weren't added to the NOHZ wakeup detection. In theory this can cause per-cpu errors to remain in the user-reported stats indefinitely. In practice this only affects a handful of sub counters (file_mapped, dirty and writeback e.g.) because other page state changes at the node level likely involve a change at the zone level as well (alloc and free, lru ops). Also, nobody has complained. Fix it up for completeness: wake up vmstat refreshing on node changes. Also remove the BUILD_BUG_ONs that assert counter size; we haven't relied on it since we added sizeof() to the range calculation in commit 13c9aaf7fa01 ("mm/vmstat.c: fix NUMA statistics updates"). Link: https://lkml.kernel.org/r/20210202184342.118513-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6cdf789ced5e..0b0fc3b77789 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1894,16 +1894,12 @@ static void vmstat_update(struct work_struct *w) */ static bool need_update(int cpu) { + pg_data_t *last_pgdat = NULL; struct zone *zone; for_each_populated_zone(zone) { struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); - - BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); -#ifdef CONFIG_NUMA - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); -#endif - + struct per_cpu_nodestat *n; /* * The fast way of checking if there are any vmstat diffs. */ @@ -1915,6 +1911,13 @@ static bool need_update(int cpu) sizeof(p->vm_numa_stat_diff[0]))) return true; #endif + if (last_pgdat == zone->zone_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); + if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS * + sizeof(n->vm_node_stat_diff[0]))) + return true; } return false; } From 629484ae73754243917e06d8d5e5f37c26e99399 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Feb 2021 17:16:51 -0800 Subject: [PATCH 412/506] mm: vmstat: add some comments on internal storage of byte items Byte-accounted items are used for slab object accounting at the cgroup level, because the objects in a slab page can belong to different cgroups. At the global level these items always change in multiples of whole slab pages. The vmstat code exploits this and stores these items as pages internally, which allows for more compact per-cpu data. This optimization isn't self-evident from the asserts and the division in the stat update functions. Provide the reader with some context. Link: https://lkml.kernel.org/r/20210202184411.118614-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 6 ++++++ mm/vmstat.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 773135fc6e19..506d625163a1 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -313,6 +313,12 @@ static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 0b0fc3b77789..e60b36f5f0a9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -342,6 +342,12 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long t; if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } @@ -551,6 +557,12 @@ static inline void mod_node_state(struct pglist_data *pgdat, long o, n, t, z; if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } From fbcc8183a4f815910697237386681153a05d9573 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Thu, 25 Feb 2021 17:16:54 -0800 Subject: [PATCH 413/506] mm/vmstat.c: erase latency in vmstat_shepherd Many 100us+ latencies have been deteceted in vmstat_shepherd() on CPX platform which has 208 logic cpus. And vmstat_shepherd is queued every second, which could make the case worse. Add schedule point in vmstat_shepherd() to erase the latency. Link: https://lkml.kernel.org/r/20210111035526.1511-1-benbjiang@tencent.com Signed-off-by: Jiang Biao Reported-by: Bin Lai Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index e60b36f5f0a9..74b2c374b86c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1980,6 +1980,8 @@ static void vmstat_shepherd(struct work_struct *w) if (!delayed_work_pending(dw) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); + + cond_resched(); } put_online_cpus(); From 9f605f260594f99b950062fd62244251e85dbd2b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:16:57 -0800 Subject: [PATCH 414/506] mm: move pfn_to_online_page() out of line Patch series "mm: Fix pfn_to_online_page() with respect to ZONE_DEVICE", v4. A pfn-walker that uses pfn_to_online_page() may inadvertently translate a pfn as online and in the page allocator, when it is offline managed by a ZONE_DEVICE mapping (details in Patch 3: ("mm: Teach pfn_to_online_page() about ZONE_DEVICE section collisions")). The 2 proposals under consideration are teach pfn_to_online_page() to be precise in the presence of mixed-zone sections, or teach the memory-add code to drop the System RAM associated with ZONE_DEVICE collisions. In order to not regress memory capacity by a few 10s to 100s of MiB the approach taken in this set is to add precision to pfn_to_online_page(). In the course of validating pfn_to_online_page() a couple other fixes fell out: 1/ soft_offline_page() fails to drop the reference taken in the madvise(..., MADV_SOFT_OFFLINE) case. 2/ memory_failure() uses get_dev_pagemap() to lookup ZONE_DEVICE pages, however that mapping may contain data pages and metadata raw pfns. Introduce pgmap_pfn_valid() to delineate the 2 types and fail the handling of raw metadata pfns. This patch (of 4); pfn_to_online_page() is already too large to be a macro or an inline function. In anticipation of further logic changes / growth, move it out of line. No functional change, just code movement. Link: https://lkml.kernel.org/r/161058499000.1840162.702316708443239771.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/161058499608.1840162.10165648147615238793.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Naoya Horiguchi Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 17 +---------------- mm/memory_hotplug.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 15acce5ab106..3d99de0db2dd 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,22 +16,7 @@ struct resource; struct vmem_altmap; #ifdef CONFIG_MEMORY_HOTPLUG -/* - * Return page for the valid pfn only if the page is online. All pfn - * walkers which rely on the fully initialized page->flags and others - * should use this rather than pfn_valid && pfn_to_page - */ -#define pfn_to_online_page(pfn) \ -({ \ - struct page *___page = NULL; \ - unsigned long ___pfn = pfn; \ - unsigned long ___nr = pfn_to_section_nr(___pfn); \ - \ - if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr) && \ - pfn_valid_within(___pfn)) \ - ___page = pfn_to_page(___pfn); \ - ___page; \ -}) +struct page *pfn_to_online_page(unsigned long pfn); /* * Types for free bootmem stored in page->lru.next. These have to be in diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index abe43c1ae920..fc6cdd99941b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -300,6 +300,22 @@ static int check_hotplug_memory_addressable(unsigned long pfn, return 0; } +/* + * Return page for the valid pfn only if the page is online. All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +struct page *pfn_to_online_page(unsigned long pfn) +{ + unsigned long nr = pfn_to_section_nr(pfn); + + if (nr < NR_MEM_SECTIONS && online_section_nr(nr) && + pfn_valid_within(pfn)) + return pfn_to_page(pfn); + return NULL; +} +EXPORT_SYMBOL_GPL(pfn_to_online_page); + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will From 9f9b02e5b3468e665a576a86ceb72f753001710b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:17:01 -0800 Subject: [PATCH 415/506] mm: teach pfn_to_online_page() to consider subsection validity pfn_to_online_page is primarily used to filter out offline or fully uninitialized pages. pfn_valid resp. online_section_nr have a coarse per memory section granularity. If a section shared with a partially offline memory (e.g. part of ZONE_DEVICE) then pfn_to_online_page would lead to a false positive on some pfns. Fix this by adding pfn_section_valid check which is subsection aware. [mhocko@kernel.org: changelog rewrite] Link: https://lkml.kernel.org/r/161058500148.1840162.4365921007820501696.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: b13bc35193d9 ("mm/hotplug: invalid PFNs from pfn_to_online_page()") Signed-off-by: Dan Williams Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Qian Cai Cc: Oscar Salvador Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc6cdd99941b..02378f11e2d6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -308,11 +308,26 @@ static int check_hotplug_memory_addressable(unsigned long pfn, struct page *pfn_to_online_page(unsigned long pfn) { unsigned long nr = pfn_to_section_nr(pfn); + struct mem_section *ms; - if (nr < NR_MEM_SECTIONS && online_section_nr(nr) && - pfn_valid_within(pfn)) - return pfn_to_page(pfn); - return NULL; + if (nr >= NR_MEM_SECTIONS) + return NULL; + + ms = __nr_to_section(nr); + if (!online_section(ms)) + return NULL; + + /* + * Save some code text when online_section() + + * pfn_section_valid() are sufficient. + */ + if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) + return NULL; + + if (!pfn_section_valid(ms, pfn)) + return NULL; + + return pfn_to_page(pfn); } EXPORT_SYMBOL_GPL(pfn_to_online_page); From 1f90a3477df3ff1a91e064af554cdc887c8f9e5e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:17:05 -0800 Subject: [PATCH 416/506] mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions While pfn_to_online_page() is able to determine pfn_valid() at subsection granularity it is not able to reliably determine if a given pfn is also online if the section is mixes ZONE_{NORMAL,MOVABLE} with ZONE_DEVICE. This means that pfn_to_online_page() may return invalid @page objects. For example with a memory map like: 100000000-1fbffffff : System RAM 142000000-143002e16 : Kernel code 143200000-143713fff : Kernel rodata 143800000-143b15b7f : Kernel data 144227000-144ffffff : Kernel bss 1fc000000-2fbffffff : Persistent Memory (legacy) 1fc000000-2fbffffff : namespace0.0 This command: echo 0x1fc000000 > /sys/devices/system/memory/soft_offline_page ...succeeds when it should fail. When it succeeds it touches an uninitialized page and may crash or cause other damage (see dissolve_free_huge_page()). While the memory map above is contrived via the memmap=ss!nn kernel command line option, the collision happens in practice on shipping platforms. The memory controller resources that decode spans of physical address space are a limited resource. One technique platform-firmware uses to conserve those resources is to share a decoder across 2 devices to keep the address range contiguous. Unfortunately the unit of operation of a decoder is 64MiB while the Linux section size is 128MiB. This results in situations where, without subsection hotplug memory mappings with different lifetimes collide into one object that can only express one lifetime. Update move_pfn_range_to_zone() to flag (SECTION_TAINT_ZONE_DEVICE) a section that mixes ZONE_DEVICE pfns with other online pfns. With SECTION_TAINT_ZONE_DEVICE to delineate, pfn_to_online_page() can fall back to a slow-path check for ZONE_DEVICE pfns in an online section. In the fast path online_section() for a full ZONE_DEVICE section returns false. Because the collision case is rare, and for simplicity, the SECTION_TAINT_ZONE_DEVICE flag is never cleared once set. [dan.j.williams@intel.com: fix CONFIG_ZONE_DEVICE=n build] Link: https://lkml.kernel.org/r/CAPcyv4iX+7LAgAeSqx7Zw-Zd=ZV9gBv8Bo7oTbwCOOqJoZ3+Yg@mail.gmail.com Link: https://lkml.kernel.org/r/161058500675.1840162.7887862152161279354.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug") Signed-off-by: Dan Williams Reported-by: Michal Hocko Acked-by: Michal Hocko Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Naoya Horiguchi Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 34 +++++++++++++++++++++++++++------- mm/memory_hotplug.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 7 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f9c4dad73ed..47946cec7584 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -918,6 +918,18 @@ static inline int local_memory_node(int node_id) { return node_id; }; */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +#ifdef CONFIG_ZONE_DEVICE +static inline bool zone_is_zone_device(struct zone *zone) +{ + return zone_idx(zone) == ZONE_DEVICE; +} +#else +static inline bool zone_is_zone_device(struct zone *zone) +{ + return false; +} +#endif + /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than @@ -1306,13 +1318,14 @@ extern size_t mem_section_usage_size(void); * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available. */ -#define SECTION_MARKED_PRESENT (1UL<<0) -#define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_IS_ONLINE (1UL<<2) -#define SECTION_IS_EARLY (1UL<<3) -#define SECTION_MAP_LAST_BIT (1UL<<4) -#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 3 +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_IS_ONLINE (1UL<<2) +#define SECTION_IS_EARLY (1UL<<3) +#define SECTION_TAINT_ZONE_DEVICE (1UL<<4) +#define SECTION_MAP_LAST_BIT (1UL<<5) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) +#define SECTION_NID_SHIFT 3 static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1351,6 +1364,13 @@ static inline int online_section(struct mem_section *section) return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } +static inline int online_device_section(struct mem_section *section) +{ + unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; + + return section && ((section->section_mem_map & flags) == flags); +} + static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 02378f11e2d6..3af4d3851d1a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -308,6 +308,7 @@ static int check_hotplug_memory_addressable(unsigned long pfn, struct page *pfn_to_online_page(unsigned long pfn) { unsigned long nr = pfn_to_section_nr(pfn); + struct dev_pagemap *pgmap; struct mem_section *ms; if (nr >= NR_MEM_SECTIONS) @@ -327,6 +328,22 @@ struct page *pfn_to_online_page(unsigned long pfn) if (!pfn_section_valid(ms, pfn)) return NULL; + if (!online_device_section(ms)) + return pfn_to_page(pfn); + + /* + * Slowpath: when ZONE_DEVICE collides with + * ZONE_{NORMAL,MOVABLE} within the same section some pfns in + * the section may be 'offline' but 'valid'. Only + * get_dev_pagemap() can determine sub-section online status. + */ + pgmap = get_dev_pagemap(pfn, NULL); + put_dev_pagemap(pgmap); + + /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ + if (pgmap) + return NULL; + return pfn_to_page(pfn); } EXPORT_SYMBOL_GPL(pfn_to_online_page); @@ -709,6 +726,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } + +static void section_taint_zone_device(unsigned long pfn) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; +} + /* * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this @@ -738,6 +763,19 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, resize_pgdat_range(pgdat, start_pfn, nr_pages); pgdat_resize_unlock(pgdat, &flags); + /* + * Subsection population requires care in pfn_to_online_page(). + * Set the taint to enable the slow path detection of + * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} + * section. + */ + if (zone_is_zone_device(zone)) { + if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn); + if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn + nr_pages); + } + /* * TODO now we have a visible range of pages which are not associated * with their zone properly. Not nice but set_pfnblock_flags_mask From 34dc45be4563f344d59ba0428416d0d265aa4f4d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Feb 2021 17:17:08 -0800 Subject: [PATCH 417/506] mm: fix memory_failure() handling of dax-namespace metadata Given 'struct dev_pagemap' spans both data pages and metadata pages be careful to consult the altmap if present to delineate metadata. In fact the pfn_first() helper already identifies the first valid data pfn, so export that helper for other code paths via pgmap_pfn_valid(). Other usage of get_dev_pagemap() are not a concern because those are operating on known data pfns having been looked up by get_user_pages(). I.e. metadata pfns are never user mapped. Link: https://lkml.kernel.org/r/161058501758.1840162.4239831989762604527.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 6100e34b2526 ("mm, memory_failure: Teach memory_failure() about dev_pagemap pages") Signed-off-by: Dan Williams Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Oscar Salvador Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 6 ++++++ mm/memory-failure.c | 6 ++++++ mm/memremap.c | 15 +++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 79c49e7f5c30..f5b464daeeca 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -137,6 +137,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); +bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); @@ -165,6 +166,11 @@ static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, return NULL; } +static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) +{ + return false; +} + static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 55c671904aac..24210c9bd843 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1312,6 +1312,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, */ put_page(page); + /* device metadata space is not recoverable */ + if (!pgmap_pfn_valid(pgmap, pfn)) { + rc = -ENXIO; + goto out; + } + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by diff --git a/mm/memremap.c b/mm/memremap.c index 16b2fb482da1..2455bac89506 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -80,6 +80,21 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id) return pfn + vmem_altmap_offset(pgmap_altmap(pgmap)); } +bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) +{ + int i; + + for (i = 0; i < pgmap->nr_range; i++) { + struct range *range = &pgmap->ranges[i]; + + if (pfn >= PHYS_PFN(range->start) && + pfn <= PHYS_PFN(range->end)) + return pfn >= pfn_first(pgmap, i); + } + + return false; +} + static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) { const struct range *range = &pgmap->ranges[range_id]; From 1adf8b468ff6bc64ba01ce3848da4bcf409215b4 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:13 -0800 Subject: [PATCH 418/506] mm/memory_hotplug: rename all existing 'memhp' into 'mhp' This renames all 'memhp' instances to 'mhp' except for memhp_default_state for being a kernel command line option. This is just a clean up and should not cause a functional change. Let's make it consistent rater than mixing the two prefixes. In preparation for more users of the 'mhp' terminology. Link: https://lkml.kernel.org/r/1611554093-27316-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 10 +++++----- include/linux/memory_hotplug.h | 4 ++-- mm/memory_hotplug.c | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index eef4ffb6122c..901e379676be 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = { [MMOP_ONLINE_MOVABLE] = "online_movable", }; -int memhp_online_type_from_str(const char *str) +int mhp_online_type_from_str(const char *str) { int i; @@ -253,7 +253,7 @@ static int memory_subsys_offline(struct device *dev) static ssize_t state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - const int online_type = memhp_online_type_from_str(buf); + const int online_type = mhp_online_type_from_str(buf); struct memory_block *mem = to_memory_block(dev); int ret; @@ -387,19 +387,19 @@ static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", - online_type_to_str[memhp_default_online_type]); + online_type_to_str[mhp_default_online_type]); } static ssize_t auto_online_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - const int online_type = memhp_online_type_from_str(buf); + const int online_type = mhp_online_type_from_str(buf); if (online_type < 0) return -EINVAL; - memhp_default_online_type = online_type; + mhp_default_online_type = online_type; return count; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3d99de0db2dd..ca5e8d137726 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -116,10 +116,10 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params); extern u64 max_mem_size; -extern int memhp_online_type_from_str(const char *str); +extern int mhp_online_type_from_str(const char *str); /* Default online_type (MMOP_*) when new memory blocks are added. */ -extern int memhp_default_online_type; +extern int mhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3af4d3851d1a..ac1c686a5989 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,17 +67,17 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int memhp_default_online_type = MMOP_OFFLINE; +int mhp_default_online_type = MMOP_OFFLINE; #else -int memhp_default_online_type = MMOP_ONLINE; +int mhp_default_online_type = MMOP_ONLINE; #endif static int __init setup_memhp_default_state(char *str) { - const int online_type = memhp_online_type_from_str(str); + const int online_type = mhp_online_type_from_str(str); if (online_type >= 0) - memhp_default_online_type = online_type; + mhp_default_online_type = online_type; return 1; } @@ -1076,7 +1076,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = memhp_default_online_type; + mem->online_type = mhp_default_online_type; return device_online(&mem->dev); } @@ -1157,7 +1157,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (memhp_default_online_type != MMOP_OFFLINE) + if (mhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; From 26011267e1a7ddaab50b5f81b402ca3e7fc2887c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:17 -0800 Subject: [PATCH 419/506] mm/memory_hotplug: MEMHP_MERGE_RESOURCE -> MHP_MERGE_RESOURCE Let's make "MEMHP_MERGE_RESOURCE" consistent with "MHP_NONE", "mhp_t" and "mhp_flags". As discussed recently [1], "mhp" is our internal acronym for memory hotplug now. [1] https://lore.kernel.org/linux-mm/c37de2d0-28a1-4f7d-f944-cfd7d81c334d@redhat.com/ Link: https://lkml.kernel.org/r/20210126115829.10909-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Miaohe Lin Acked-by: Michael S. Tsirkin Reviewed-by: Oscar Salvador Acked-by: Wei Liu Reviewed-by: Pankaj Gupta Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Jason Wang Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Michal Hocko Cc: Anshuman Khandual Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hv/hv_balloon.c | 2 +- drivers/virtio/virtio_mem.c | 2 +- drivers/xen/balloon.c | 2 +- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 8c471823a5af..2f776d78e3c1 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE); + (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 9fc9ec4a25f5..d44e43869f17 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -623,7 +623,7 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, /* Memory might get onlined immediately. */ atomic64_add(size, &vm->offline_size); rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, - MEMHP_MERGE_RESOURCE); + MHP_MERGE_RESOURCE); if (rc) { atomic64_sub(size, &vm->offline_size); dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index b57b2067ecbf..671c71245a7b 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE); + rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE); unlock_device_hotplug(); mutex_lock(&balloon_mutex); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ca5e8d137726..08eeef679ab7 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -53,7 +53,7 @@ typedef int __bitwise mhp_t; * with this flag set, the resource pointer must no longer be used as it * might be stale, or the resource might have changed. */ -#define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +#define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) /* * Extended parameters for memory hotplug: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ac1c686a5989..6a02c3f42717 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1153,7 +1153,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * In case we're allowed to merge the resource, flag it and trigger * merging now that adding succeeded. */ - if (mhp_flags & MEMHP_MERGE_RESOURCE) + if (mhp_flags & MHP_MERGE_RESOURCE) merge_system_ram_resource(res); /* online pages if requested */ From 6c922cf75115c8b389c091a073209ca45f1af530 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:21 -0800 Subject: [PATCH 420/506] mm/memory_hotplug: use helper function zone_end_pfn() to get end_pfn Commit 108bcc96ef70 ("mm: add & use zone_end_pfn() and zone_spans_pfn()") introduced the helper zone_end_pfn() to calculate the zone end pfn. But update_pgdat_span() forgot to use it. Use this helper and rename local variable zone_end_pfn to end_pfn to avoid a naming conflict with the existing zone_end_pfn(). Link: https://lkml.kernel.org/r/20210127093211.37714-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a02c3f42717..a969463bdda4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -493,20 +493,19 @@ static void update_pgdat_span(struct pglist_data *pgdat) for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { - unsigned long zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; + unsigned long end_pfn = zone_end_pfn(zone); /* No need to lock the zones, they can't change. */ if (!zone->spanned_pages) continue; if (!node_end_pfn) { node_start_pfn = zone->zone_start_pfn; - node_end_pfn = zone_end_pfn; + node_end_pfn = end_pfn; continue; } - if (zone_end_pfn > node_end_pfn) - node_end_pfn = zone_end_pfn; + if (end_pfn > node_end_pfn) + node_end_pfn = end_pfn; if (zone->zone_start_pfn < node_start_pfn) node_start_pfn = zone->zone_start_pfn; } From e9a2e48e8704c9d20a625c6f2357147d03ea7b97 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:24 -0800 Subject: [PATCH 421/506] drivers/base/memory: don't store phys_device in memory blocks No need to store the value for each and every memory block, as we can easily query the value at runtime. Reshuffle the members to optimize the memory layout. Also, let's clarify what the interface once was used for and why it's legacy nowadays. "phys_device" was used on s390x in older versions of lsmem[2]/chmem[3], back when they were still part of s390x-tools. They were later replaced by the variants in linux-utils. For example, RHEL6 and RHEL7 contain lsmem/chmem from s390-utils. RHEL8 switched to versions from util-linux on s390x [4]. "phys_device" was added with sysfs support for memory hotplug in commit 3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") in 2005. It always returned 0. s390x started returning something != 0 on some setups (if sclp.rzm is set by HW) in 2010 via commit 57b552ba0b2f ("memory hotplug/s390: set phys_device"). For s390x, it allowed for identifying which memory block devices belong to the same storage increment (RZM). Only if all memory block devices comprising a single storage increment were offline, the memory could actually be removed in the hypervisor. Since commit e5d709bb5fb7 ("s390/memory hotplug: provide memory_block_size_bytes() function") in 2013 a memory block device spans at least one storage increment - which is why the interface isn't really helpful/used anymore (except by old lsmem/chmem tools). There were once RFC patches to make use of "phys_device" in ACPI context; however, the underlying problem could be solved using different interfaces [1]. [1] https://patchwork.kernel.org/patch/2163871/ [2] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/lsmem [3] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/chmem [4] https://bugzilla.redhat.com/show_bug.cgi?id=1504134 Link: https://lkml.kernel.org/r/20210201181347.13262-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Gerald Schaefer Cc: Jonathan Corbet Cc: "Rafael J. Wysocki" Cc: Mauro Carvalho Chehab Cc: Ilya Dryomov Cc: Vaibhav Jain Cc: Tom Rix Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-devices-memory | 5 ++-- .../admin-guide/mm/memory-hotplug.rst | 4 +-- drivers/base/memory.c | 25 +++++++------------ include/linux/memory.h | 3 +-- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 246a45b96d22..58dbc592bc57 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -26,8 +26,9 @@ Date: September 2008 Contact: Badari Pulavarty Description: The file /sys/devices/system/memory/memoryX/phys_device - is read-only and is designed to show the name of physical - memory device. Implementation is currently incomplete. + is read-only; it is a legacy interface only ever used on s390x + to expose the covered storage increment. +Users: Legacy s390-tools lsmem/chmem What: /sys/devices/system/memory/memoryX/phys_index Date: September 2008 diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 5c4432c96c4b..245739f55ac7 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -160,8 +160,8 @@ Under each memory block, you can see 5 files: "online_movable", "online", "offline" command which will be performed on all sections in the block. -``phys_device`` read-only: designed to show the name of physical memory - device. This is not well implemented now. +``phys_device`` read-only: legacy interface only ever used on s390x to + expose the covered storage increment. ``removable`` read-only: contains an integer value indicating whether the memory block is removable or not removable. A value of 1 indicates that the memory diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 901e379676be..f35298425575 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -290,20 +290,20 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, } /* - * phys_device is a bad name for this. What I really want - * is a way to differentiate between memory ranges that - * are part of physical devices that constitute - * a complete removable unit or fru. - * i.e. do these ranges belong to the same physical device, - * s.t. if I offline all of these sections I can then - * remove the physical device? + * Legacy interface that we cannot remove: s390x exposes the storage increment + * covered by a memory block, allowing for identifying which memory blocks + * comprise a storage increment. Since a memory block spans complete + * storage increments nowadays, this interface is basically unused. Other + * archs never exposed != 0. */ static ssize_t phys_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); - return sysfs_emit(buf, "%d\n", mem->phys_device); + return sysfs_emit(buf, "%d\n", + arch_get_memory_phys_device(start_pfn)); } #ifdef CONFIG_MEMORY_HOTREMOVE @@ -488,11 +488,7 @@ static DEVICE_ATTR_WO(soft_offline_page); static DEVICE_ATTR_WO(hard_offline_page); #endif -/* - * Note that phys_device is optional. It is here to allow for - * differentiation between which *physical* devices each - * section belongs to... - */ +/* See phys_device_show(). */ int __weak arch_get_memory_phys_device(unsigned long start_pfn) { return 0; @@ -574,7 +570,6 @@ int register_memory(struct memory_block *memory) static int init_memory_block(unsigned long block_id, unsigned long state) { struct memory_block *mem; - unsigned long start_pfn; int ret = 0; mem = find_memory_block_by_id(block_id); @@ -588,8 +583,6 @@ static int init_memory_block(unsigned long block_id, unsigned long state) mem->start_section_nr = block_id * sections_per_block; mem->state = state; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - mem->phys_device = arch_get_memory_phys_device(start_pfn); mem->nid = NUMA_NO_NODE; ret = register_memory(mem); diff --git a/include/linux/memory.h b/include/linux/memory.h index 439a89e758d8..4da95e684e20 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -27,9 +27,8 @@ struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ - int phys_device; /* to which fru does this belong? */ - struct device dev; int nid; /* NID for this memory block */ + struct device dev; }; int arch_get_memory_phys_device(unsigned long start_pfn); From a89107c0478137115c6647aa28caef75513b9f40 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:28 -0800 Subject: [PATCH 422/506] Documentation: sysfs/memory: clarify some memory block device properties In commit 53cdc1cb29e8 ("drivers/base/memory.c: indicate all memory blocks as removable") we changed the output of the "removable" property of memory devices to return "1" if and only if the kernel supports memory offlining. Let's update documentation, stating that the interface is legacy. Also update documentation of the "state" property and "valid_zones" properties. Link: https://lkml.kernel.org/r/20210201181347.13262-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Dave Hansen Cc: Jonathan Corbet Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jonathan Cameron Cc: Ilya Dryomov Cc: Mauro Carvalho Chehab Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-devices-memory | 51 ++++++++++++------- .../admin-guide/mm/memory-hotplug.rst | 16 +++--- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 58dbc592bc57..d8b0f80b9e33 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -13,13 +13,13 @@ What: /sys/devices/system/memory/memoryX/removable Date: June 2008 Contact: Badari Pulavarty Description: - The file /sys/devices/system/memory/memoryX/removable - indicates whether this memory block is removable or not. - This is useful for a user-level agent to determine - identify removable sections of the memory before attempting - potentially expensive hot-remove memory operation + The file /sys/devices/system/memory/memoryX/removable is a + legacy interface used to indicated whether a memory block is + likely to be offlineable or not. Newer kernel versions return + "1" if and only if the kernel supports memory offlining. Users: hotplug memory remove tools http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils + lsmem/chmem part of util-linux What: /sys/devices/system/memory/memoryX/phys_device Date: September 2008 @@ -44,23 +44,25 @@ Date: September 2008 Contact: Badari Pulavarty Description: The file /sys/devices/system/memory/memoryX/state - is read-write. When read, its contents show the - online/offline state of the memory section. When written, - root can toggle the the online/offline state of a removable - memory section (see removable file description above) - using the following commands:: + is read-write. When read, it returns the online/offline + state of the memory block. When written, root can toggle + the online/offline state of a memory block using the following + commands:: # echo online > /sys/devices/system/memory/memoryX/state # echo offline > /sys/devices/system/memory/memoryX/state - For example, if /sys/devices/system/memory/memory22/removable - contains a value of 1 and - /sys/devices/system/memory/memory22/state contains the - string "online" the following command can be executed by - by root to offline that section:: - - # echo offline > /sys/devices/system/memory/memory22/state + On newer kernel versions, advanced states can be specified + when onlining to select a target zone: "online_movable" + selects the movable zone. "online_kernel" selects the + applicable kernel zone (DMA, DMA32, or Normal). However, + after successfully setting one of the advanced states, + reading the file will return "online"; the zone information + can be obtained via "valid_zones" instead. + While onlining is unlikely to fail, there are no guarantees + that offlining will succeed. Offlining is more likely to + succeed if "valid_zones" indicates "Movable". Users: hotplug memory remove tools http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils @@ -70,8 +72,19 @@ Date: July 2014 Contact: Zhang Zhen Description: The file /sys/devices/system/memory/memoryX/valid_zones is - read-only and is designed to show which zone this memory - block can be onlined to. + read-only. + + For online memory blocks, it returns in which zone memory + provided by a memory block is managed. If multiple zones + apply (not applicable for hotplugged memory), "None" is returned + and the memory block cannot be offlined. + + For offline memory blocks, it returns by which zone memory + provided by a memory block can be managed when onlining. + The first returned zone ("default") will be used when setting + the state of an offline memory block to "online". Only one of + the kernel zones (DMA, DMA32, Normal) is applicable for a single + memory block. What: /sys/devices/system/memoryX/nodeY Date: October 2009 diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 245739f55ac7..5307f90738aa 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -162,14 +162,14 @@ Under each memory block, you can see 5 files: which will be performed on all sections in the block. ``phys_device`` read-only: legacy interface only ever used on s390x to expose the covered storage increment. -``removable`` read-only: contains an integer value indicating - whether the memory block is removable or not - removable. A value of 1 indicates that the memory - block is removable and a value of 0 indicates that - it is not removable. A memory block is removable only if - every section in the block is removable. -``valid_zones`` read-only: designed to show which zones this memory block - can be onlined to. +``removable`` read-only: legacy interface that indicated whether a memory + block was likely to be offlineable or not. Newer kernel + versions return "1" if and only if the kernel supports + memory offlining. +``valid_zones`` read-only: designed to show by which zone memory provided by + a memory block is managed, and to show by which zone memory + provided by an offline memory block could be managed when + onlining. The first column shows it`s default zone. From bca3feaa0764ab5a4cbe6817871601f1d00c059d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:33 -0800 Subject: [PATCH 423/506] mm/memory_hotplug: prevalidate the address range being added with platform Patch series "mm/memory_hotplug: Pre-validate the address range with platform", v5. This series adds a mechanism allowing platforms to weigh in and prevalidate incoming address range before proceeding further with the memory hotplug. This helps prevent potential platform errors for the given address range, down the hotplug call chain, which inevitably fails the hotplug itself. This mechanism was suggested by David Hildenbrand during another discussion with respect to a memory hotplug fix on arm64 platform. https://lore.kernel.org/linux-arm-kernel/1600332402-30123-1-git-send-email-anshuman.khandual@arm.com/ This mechanism focuses on the addressibility aspect and not [sub] section alignment aspect. Hence check_hotplug_memory_range() and check_pfn_span() have been left unchanged. This patch (of 4): This introduces mhp_range_allowed() which can be called in various memory hotplug paths to prevalidate the address range which is being added, with the platform. Then mhp_range_allowed() calls mhp_get_pluggable_range() which provides applicable address range depending on whether linear mapping is required or not. For ranges that require linear mapping, it calls a new arch callback arch_get_mappable_range() which the platform can override. So the new callback, in turn provides the platform an opportunity to configure acceptable memory hotplug address ranges in case there are constraints. This mechanism will help prevent platform specific errors deep down during hotplug calls. This drops now redundant check_hotplug_memory_addressable() check in __add_pages() but instead adds a VM_BUG_ON() check which would ensure that the range has been validated with mhp_range_allowed() earlier in the call chain. Besides mhp_get_pluggable_range() also can be used by potential memory hotplug callers to avail the allowed physical range which would go through on a given platform. This does not really add any new range check in generic memory hotplug but instead compensates for lost checks in arch_add_memory() where applicable and check_hotplug_memory_addressable(), with unified mhp_range_allowed(). [akpm@linux-foundation.org: make pagemap_range() return -EINVAL when mhp_range_allowed() fails] Link: https://lkml.kernel.org/r/1612149902-7867-1-git-send-email-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/1612149902-7867-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Heiko Carstens Cc: Catalin Marinas Cc: Vasily Gorbik # s390 Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Jason Wang Cc: Jonathan Cameron Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: teawater Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 10 +++++ mm/memory_hotplug.c | 78 +++++++++++++++++++++++++--------- mm/memremap.c | 8 +++- 3 files changed, 76 insertions(+), 20 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 08eeef679ab7..7288aa5ef73b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -66,6 +66,9 @@ struct mhp_params { pgprot_t pgprot; }; +bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); +struct range mhp_get_pluggable_range(bool need_mapping); + /* * Zone resizing functions * @@ -266,6 +269,13 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Keep this declaration outside CONFIG_MEMORY_HOTPLUG as some + * platforms might override and use arch_get_mappable_range() + * for internal non memory hotplug purposes. + */ +struct range arch_get_mappable_range(void); + #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * pgdat resizing functions diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a969463bdda4..5ba51a8bdaeb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -107,6 +107,9 @@ static struct resource *register_memory_resource(u64 start, u64 size, if (strcmp(resource_name, "System RAM")) flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; + if (!mhp_range_allowed(start, size, true)) + return ERR_PTR(-E2BIG); + /* * Make sure value parsed from 'mem=' only restricts memory adding * while booting, so that memory hotplug won't be impacted. Please @@ -284,22 +287,6 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, return 0; } -static int check_hotplug_memory_addressable(unsigned long pfn, - unsigned long nr_pages) -{ - const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1; - - if (max_addr >> MAX_PHYSMEM_BITS) { - const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1; - WARN(1, - "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n", - (u64)PFN_PHYS(pfn), max_addr, max_allowed); - return -E2BIG; - } - - return 0; -} - /* * Return page for the valid pfn only if the page is online. All pfn * walkers which rely on the fully initialized page->flags and others @@ -365,9 +352,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, if (WARN_ON_ONCE(!params->pgprot.pgprot)) return -EINVAL; - err = check_hotplug_memory_addressable(pfn, nr_pages); - if (err) - return err; + VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); if (altmap) { /* @@ -1248,6 +1233,61 @@ int add_memory_driver_managed(int nid, u64 start, u64 size, } EXPORT_SYMBOL_GPL(add_memory_driver_managed); +/* + * Platforms should define arch_get_mappable_range() that provides + * maximum possible addressable physical memory range for which the + * linear mapping could be created. The platform returned address + * range must adhere to these following semantics. + * + * - range.start <= range.end + * - Range includes both end points [range.start..range.end] + * + * There is also a fallback definition provided here, allowing the + * entire possible physical address range in case any platform does + * not define arch_get_mappable_range(). + */ +struct range __weak arch_get_mappable_range(void) +{ + struct range mhp_range = { + .start = 0UL, + .end = -1ULL, + }; + return mhp_range; +} + +struct range mhp_get_pluggable_range(bool need_mapping) +{ + const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; + struct range mhp_range; + + if (need_mapping) { + mhp_range = arch_get_mappable_range(); + if (mhp_range.start > max_phys) { + mhp_range.start = 0; + mhp_range.end = 0; + } + mhp_range.end = min_t(u64, mhp_range.end, max_phys); + } else { + mhp_range.start = 0; + mhp_range.end = max_phys; + } + return mhp_range; +} +EXPORT_SYMBOL_GPL(mhp_get_pluggable_range); + +bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) +{ + struct range mhp_range = mhp_get_pluggable_range(need_mapping); + u64 end = start + size; + + if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) + return true; + + pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", + start, end, mhp_range.start, mhp_range.end); + return false; +} + #ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping diff --git a/mm/memremap.c b/mm/memremap.c index 2455bac89506..7aa7d6e80ee5 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -200,6 +200,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, int range_id, int nid) { + const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE; struct range *range = &pgmap->ranges[range_id]; struct dev_pagemap *conflict_pgmap; int error, is_ram; @@ -245,6 +246,11 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (error) goto err_pfn_remap; + if (!mhp_range_allowed(range->start, range_len(range), !is_private)) { + error = -EINVAL; + goto err_pfn_remap; + } + mem_hotplug_begin(); /* @@ -258,7 +264,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, * the CPU, we do want the linear mapping and thus use * arch_add_memory(). */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + if (is_private) { error = add_pages(nid, PHYS_PFN(range->start), PHYS_PFN(range_len(range)), params); } else { From 03aaf83fba6e5af08b5dd174c72edee9b7d9ed9b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:37 -0800 Subject: [PATCH 424/506] arm64/mm: define arch_get_mappable_range() This overrides arch_get_mappable_range() on arm64 platform which will be used with recently added generic framework. It drops inside_linear_region() and subsequent check in arch_add_memory() which are no longer required. It also adds a VM_BUG_ON() check that would ensure that mhp_range_allowed() has already been called. Link: https://lkml.kernel.org/r/1612149902-7867-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: David Hildenbrand Reviewed-by: Catalin Marinas Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Heiko Carstens Cc: Jason Wang Cc: Jonathan Cameron Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: teawater Cc: Vasily Gorbik Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 25af183e4bed..d0758d24a42d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1444,16 +1444,19 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) free_empty_tables(start, end, PAGE_OFFSET, PAGE_END); } -static bool inside_linear_region(u64 start, u64 size) +struct range arch_get_mappable_range(void) { + struct range mhp_range; + /* * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] * accommodating both its ends but excluding PAGE_END. Max physical * range which can be mapped inside this linear mapping range, must * also be derived from its end points. */ - return start >= __pa(_PAGE_OFFSET(vabits_actual)) && - (start + size - 1) <= __pa(PAGE_END - 1); + mhp_range.start = __pa(_PAGE_OFFSET(vabits_actual)); + mhp_range.end = __pa(PAGE_END - 1); + return mhp_range; } int arch_add_memory(int nid, u64 start, u64 size, @@ -1461,11 +1464,7 @@ int arch_add_memory(int nid, u64 start, u64 size, { int ret, flags = 0; - if (!inside_linear_region(start, size)) { - pr_err("[%llx %llx] is outside linear mapping region\n", start, start + size); - return -EINVAL; - } - + VM_BUG_ON(!mhp_range_allowed(start, size, true)); if (rodata_full || debug_pagealloc_enabled()) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; From 7707248a4727c4e8ee8d84ed578a9807d8994a40 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 25 Feb 2021 17:17:41 -0800 Subject: [PATCH 425/506] s390/mm: define arch_get_mappable_range() This overrides arch_get_mappabble_range() on s390 platform which will be used with recently added generic framework. It modifies the existing range check in vmem_add_mapping() using arch_get_mappable_range(). It also adds a VM_BUG_ON() check that would ensure that mhp_range_allowed() has already been called on the hotplug path. Link: https://lkml.kernel.org/r/1612149902-7867-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Heiko Carstens Reviewed-by: David Hildenbrand Cc: Vasily Gorbik Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jason Wang Cc: Jonathan Cameron Cc: Mark Rutland Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: teawater Cc: Wei Yang Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/init.c | 1 + arch/s390/mm/vmem.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 73a163065b95..0e76b2127dc6 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -297,6 +297,7 @@ int arch_add_memory(int nid, u64 start, u64 size, if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; + VM_BUG_ON(!mhp_range_allowed(start, size, true)); rc = vmem_add_mapping(start, size); if (rc) return rc; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 01f3a5f58e64..82dbf9450105 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,6 +4,7 @@ * Author(s): Heiko Carstens */ +#include #include #include #include @@ -532,11 +533,22 @@ void vmem_remove_mapping(unsigned long start, unsigned long size) mutex_unlock(&vmem_mutex); } +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = VMEM_MAX_PHYS - 1; + return mhp_range; +} + int vmem_add_mapping(unsigned long start, unsigned long size) { + struct range range = arch_get_mappable_range(); int ret; - if (start + size > VMEM_MAX_PHYS || + if (start < range.start || + start + size > range.end + 1 || start + size < start) return -ERANGE; From 94c8945376d44b37aa3ab5b58669a2a86326968e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Feb 2021 17:17:45 -0800 Subject: [PATCH 426/506] virtio-mem: check against mhp_get_pluggable_range() which memory we can hotplug Right now, we only check against MAX_PHYSMEM_BITS - but turns out there are more restrictions of which memory we can actually hotplug, especially om arm64 or s390x once we support them: we might receive something like -E2BIG or -ERANGE from add_memory_driver_managed(), stopping device operation. So, check right when initializing the device which memory we can add, warning the user. Try only adding actually pluggable ranges: in the worst case, no memory provided by our device is pluggable. In the usual case, we expect all device memory to be pluggable, and in corner cases only some memory at the end of the device-managed memory region to not be pluggable. Link: https://lkml.kernel.org/r/1612149902-7867-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: David Hildenbrand Signed-off-by: Anshuman Khandual Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Oscar Salvador Cc: Wei Yang Cc: teawater Cc: Anshuman Khandual Cc: Pankaj Gupta Cc: Jonathan Cameron Cc: Vasily Gorbik Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Heiko Carstens Cc: Michal Hocko Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/virtio/virtio_mem.c | 41 ++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index d44e43869f17..1119e0c6f6c1 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2222,7 +2222,7 @@ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) */ static void virtio_mem_refresh_config(struct virtio_mem *vm) { - const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + const struct range pluggable_range = mhp_get_pluggable_range(true); uint64_t new_plugged_size, usable_region_size, end_addr; /* the plugged_size is just a reflection of what _we_ did previously */ @@ -2234,15 +2234,25 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm) /* calculate the last usable memory block id */ virtio_cread_le(vm->vdev, struct virtio_mem_config, usable_region_size, &usable_region_size); - end_addr = vm->addr + usable_region_size; - end_addr = min(end_addr, phys_limit); + end_addr = min(vm->addr + usable_region_size - 1, + pluggable_range.end); - if (vm->in_sbm) - vm->sbm.last_usable_mb_id = - virtio_mem_phys_to_mb_id(end_addr) - 1; - else - vm->bbm.last_usable_bb_id = - virtio_mem_phys_to_bb_id(vm, end_addr) - 1; + if (vm->in_sbm) { + vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); + if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) + vm->sbm.last_usable_mb_id--; + } else { + vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, + end_addr); + if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) + vm->bbm.last_usable_bb_id--; + } + /* + * If we cannot plug any of our device memory (e.g., nothing in the + * usable region is addressable), the last usable memory block id will + * be smaller than the first usable memory block id. We'll stop + * attempting to add memory with -ENOSPC from our main loop. + */ /* see if there is a request to change the size */ virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, @@ -2364,7 +2374,7 @@ static int virtio_mem_init_vq(struct virtio_mem *vm) static int virtio_mem_init(struct virtio_mem *vm) { - const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + const struct range pluggable_range = mhp_get_pluggable_range(true); uint64_t sb_size, addr; uint16_t node_id; @@ -2405,9 +2415,10 @@ static int virtio_mem_init(struct virtio_mem *vm) if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) dev_warn(&vm->vdev->dev, "The alignment of the physical end address can make some memory unusable.\n"); - if (vm->addr + vm->region_size > phys_limit) + if (vm->addr < pluggable_range.start || + vm->addr + vm->region_size - 1 > pluggable_range.end) dev_warn(&vm->vdev->dev, - "Some memory is not addressable. This can make some memory unusable.\n"); + "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); /* * We want subblocks to span at least MAX_ORDER_NR_PAGES and @@ -2429,7 +2440,8 @@ static int virtio_mem_init(struct virtio_mem *vm) vm->sbm.sb_size; /* Round up to the next full memory block */ - addr = vm->addr + memory_block_size_bytes() - 1; + addr = max_t(uint64_t, vm->addr, pluggable_range.start) + + memory_block_size_bytes() - 1; vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); vm->sbm.next_mb_id = vm->sbm.first_mb_id; } else { @@ -2450,7 +2462,8 @@ static int virtio_mem_init(struct virtio_mem *vm) } /* Round up to the next aligned big block */ - addr = vm->addr + vm->bbm.bb_size - 1; + addr = max_t(uint64_t, vm->addr, pluggable_range.start) + + vm->bbm.bb_size - 1; vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); vm->bbm.next_bb_id = vm->bbm.first_bb_id; } From 48b03eea321c85185d173cb0d112698b79b1c98e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:49 -0800 Subject: [PATCH 427/506] mm/mlock: stop counting mlocked pages when none vma is found There will be no vma satisfies addr < vm_end when find_vma() returns NULL. Thus it's meaningless to traverse the vma list below because we can't find any vma to count mlocked pages. Stop counting mlocked pages in this case to save some vma list traversal cycles. Link: https://lkml.kernel.org/r/20210204110705.17586-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mlock.c b/mm/mlock.c index 73960bb3464d..f8f8cc32d03d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -622,7 +622,7 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, vma = find_vma(mm, start); if (vma == NULL) - vma = mm->mmap; + return 0; for (; vma ; vma = vma->vm_next) { if (start >= vma->vm_end) From aaf1f990aee40bc74b425ef8f51201ae21b85ed7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:53 -0800 Subject: [PATCH 428/506] mm/rmap: correct some obsolete comments of anon_vma commit 2b575eb64f7a ("mm: convert anon_vma->lock to a mutex") changed spinlock used to serialize access to vma list to mutex. And further, the commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem") converted the mutex to an rwsem for solving scalability problem. So replace spinlock with rwsem to make comment uptodate. Link: https://lkml.kernel.org/r/20210123072459.25903-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index e26ae119a131..f6f43620cd97 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -168,7 +168,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in page_lock_anon_vma_read() - * and that may actually touch the spinlock even in the newly + * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * @@ -359,7 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) goto out_error_free_anon_vma; /* - * The root anon_vma's spinlock is the lock actually used when we + * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; From e0af87ff7afcde2660be44302836d2d5618185af Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:56 -0800 Subject: [PATCH 429/506] mm/rmap: remove unneeded semicolon in page_not_mapped() Remove extra semicolon without any functional change intended. Link: https://lkml.kernel.org/r/20210127093425.39640-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index f6f43620cd97..46fdbf541b8e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1784,7 +1784,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) static int page_not_mapped(struct page *page) { return !page_mapped(page); -}; +} /** * try_to_munlock - try to munlock a page From 90aaca852ca13a6c962b25964fb6678120f266b1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:59 -0800 Subject: [PATCH 430/506] mm/rmap: fix obsolete comment in __page_check_anon_rmap() Commit 21333b2b66b8 ("ksm: no debug in page_dup_rmap()") has reverted page_dup_rmap() to an inline atomic_inc of mapcount. So page_dup_rmap() does not call __page_check_anon_rmap() anymore. Link: https://lkml.kernel.org/r/20210128110209.50857-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 46fdbf541b8e..c3f6e060d73f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1086,8 +1086,7 @@ static void __page_check_anon_rmap(struct page *page, * be set up correctly at this point. * * We have exclusion against page_add_anon_rmap because the caller - * always holds the page locked, except if called from page_dup_rmap, - * in which case the page is already known to be setup. + * always holds the page locked. * * We have exclusion against page_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked From b7e188ec98b1644ff70a6d3624ea16aadc39f5e0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:03 -0800 Subject: [PATCH 431/506] mm/rmap: use page_not_mapped in try_to_unmap() page_mapcount_is_zero() calculates accurately how many mappings a hugepage has in order to check against 0 only. This is a waste of cpu time. We can do this via page_not_mapped() to save some possible atomic_read cycles. Remove the function page_mapcount_is_zero() as it's not used anymore and move page_not_mapped() above try_to_unmap() to avoid identifier undeclared compilation error. Link: https://lkml.kernel.org/r/20210130084904.35307-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index c3f6e060d73f..b49e85605f8f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1736,9 +1736,9 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_mapcount_is_zero(struct page *page) +static int page_not_mapped(struct page *page) { - return !total_mapcount(page); + return !page_mapped(page); } /** @@ -1756,7 +1756,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, - .done = page_mapcount_is_zero, + .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1780,11 +1780,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) return !page_mapcount(page) ? true : false; } -static int page_not_mapped(struct page *page) -{ - return !page_mapped(page); -} - /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked From ad8a20cf6d19a9506b4a554030bafc1ac204ef31 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:06 -0800 Subject: [PATCH 432/506] mm/rmap: correct obsolete comment of page_get_anon_vma() Since commit 746b18d421da ("mm: use refcounts for page_lock_anon_vma()"), page_lock_anon_vma() is renamed to page_get_anon_vma() and converted to return a refcount increased anon_vma. But it forgot to change the relevant comment. Link: https://lkml.kernel.org/r/20210203093215.31990-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index b49e85605f8f..b0fc27e77d6d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -462,8 +462,8 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * * Since there is no serialization what so ever against page_remove_rmap() - * the best this function can do is return a locked anon_vma that might - * have been relevant to this page. + * the best this function can do is return a refcount increased anon_vma + * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). From 5d5d19eda6b0ee790af89c45e3f678345be6f50f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:09 -0800 Subject: [PATCH 433/506] mm/rmap: fix potential pte_unmap on an not mapped pte For PMD-mapped page (usually THP), pvmw->pte is NULL. For PTE-mapped THP, pvmw->pte is mapped. But for HugeTLB pages, pvmw->pte is not mapped and set to the relevant page table entry. So in page_vma_mapped_walk_done(), we may do pte_unmap() for HugeTLB pte which is not mapped. Fix this by checking pvmw->page against PageHuge before trying to do pte_unmap(). Link: https://lkml.kernel.org/r/20210127093349.39081-1-linmiaohe@huawei.com Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()") Signed-off-by: Hongxiang Lou Signed-off-by: Miaohe Lin Tested-by: Sedat Dilek Cc: Kees Cook Cc: Nathan Chancellor Cc: Mike Kravetz Cc: Shakeel Butt Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Michel Lespinasse Cc: Nick Desaulniers Cc: "Kirill A. Shutemov" Cc: Wei Yang Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Brian Geffon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 70085ca1a3fc..def5c62c93b3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -213,7 +213,8 @@ struct page_vma_mapped_walk { static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { - if (pvmw->pte) + /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ + if (pvmw->pte && !PageHuge(pvmw->page)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); From c0c641d77b9ab0da798ca86d34d2327d6f427f4c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:18:13 -0800 Subject: [PATCH 434/506] mm: zswap: clean up confusing comment Correct wording and change one duplicated word (it) to "it is". Link: https://lkml.kernel.org/r/20201221042848.13980-1-rdunlap@infradead.org Fixes: 0ab0abcf5115 ("mm/zswap: refactor the get/put routines") Signed-off-by: Randy Dunlap Cc: Weijie Yang Cc: Seth Jennings Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 182f6ad5aa69..1e41c2857068 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1022,10 +1022,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* * if we get here due to ZSWAP_SWAPCACHE_EXIST - * a load may happening concurrently - * it is safe and okay to not free the entry + * a load may be happening concurrently. + * it is safe and okay to not free the entry. * if we free the entry in the following put - * it it either okay to return !0 + * it is also okay to return !0 */ fail: spin_lock(&tree->lock); From fc6697a89f56d9773b2fbff718d4cf2a6d63379d Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Thu, 25 Feb 2021 17:18:17 -0800 Subject: [PATCH 435/506] mm/zswap: add the flag can_sleep_mapped Patch series "Fix the compatibility of zsmalloc and zswap". Patch #1 adds a flag to zpool, then zswap used to determine if zpool drivers such as zbud/z3fold/zsmalloc will enter an atomic context after mapping. The difference between zbud/z3fold and zsmalloc is that zsmalloc requires an atomic context that since its map function holds a preempt-disabled, but zbud/z3fold don't require an atomic context. So patch #2 sets flag sleep_mapped to true indicating that zbud/z3fold can sleep after mapping. zsmalloc didn't support sleep after mapping, so don't set that flag to true. This patch (of 2): Add a flag to zpool, named is "can_sleep_mapped", and have it set true for zbud/z3fold, not set this flag for zsmalloc, so its default value is false. Then zswap could go the current path if the flag is true; and if it's false, copy data from src to a temporary buffer, then unmap the handle, take the mutex, process the buffer instead of src to avoid sleeping function called from atomic context. [natechancellor@gmail.com: add return value in zswap_frontswap_load] Link: https://lkml.kernel.org/r/20210121214804.926843-1-natechancellor@gmail.com [tiantao6@hisilicon.com: fix potential memory leak] Link: https://lkml.kernel.org/r/1611538365-51811-1-git-send-email-tiantao6@hisilicon.com [colin.king@canonical.com: fix potential uninitialized pointer read on tmp] Link: https://lkml.kernel.org/r/20210128141728.639030-1-colin.king@canonical.com [tiantao6@hisilicon.com: fix variable 'entry' is uninitialized when used] Link: https://lkml.kernel.org/r/1611223030-58346-1-git-send-email-tiantao6@hisilicon.comLink: https://lkml.kernel.org/r/1611035683-12732-1-git-send-email-tiantao6@hisilicon.com Link: https://lkml.kernel.org/r/1611035683-12732-2-git-send-email-tiantao6@hisilicon.com Signed-off-by: Tian Tao Signed-off-by: Nathan Chancellor Signed-off-by: Colin Ian King Reviewed-by: Vitaly Wool Acked-by: Sebastian Andrzej Siewior Reported-by: Mike Galbraith Cc: Barry Song Cc: Dan Streetman Cc: Seth Jennings Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 3 +++ mm/zpool.c | 13 +++++++++++ mm/zswap.c | 51 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 51bf43076165..e8997010612a 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool); * @malloc: allocate mem from a pool. * @free: free mem from a pool. * @shrink: shrink the pool. + * @sleep_mapped: whether zpool driver can sleep during map. * @map: map a handle. * @unmap: unmap a handle. * @total_size: get total size of a pool. @@ -100,6 +101,7 @@ struct zpool_driver { int (*shrink)(void *pool, unsigned int pages, unsigned int *reclaimed); + bool sleep_mapped; void *(*map)(void *pool, unsigned long handle, enum zpool_mapmode mm); void (*unmap)(void *pool, unsigned long handle); @@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); bool zpool_evictable(struct zpool *pool); +bool zpool_can_sleep_mapped(struct zpool *pool); #endif diff --git a/mm/zpool.c b/mm/zpool.c index 3744a2d1a624..5ed71207ced7 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -23,6 +23,7 @@ struct zpool { void *pool; const struct zpool_ops *ops; bool evictable; + bool can_sleep_mapped; struct list_head list; }; @@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; zpool->evictable = driver->shrink && ops && ops->evict; + zpool->can_sleep_mapped = driver->sleep_mapped; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool) return zpool->evictable; } +/** + * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. + * @zpool: The zpool to test + * + * Returns: true if zpool can sleep; false otherwise. + */ +bool zpool_can_sleep_mapped(struct zpool *zpool) +{ + return zpool->can_sleep_mapped; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman "); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zswap.c b/mm/zswap.c index 1e41c2857068..578d9f256920 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src; + u8 *src, *tmp = NULL; unsigned int dlen; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; + if (!zpool_can_sleep_mapped(pool)) { + tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + } + /* extract swpentry from data */ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ @@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* entry was invalidated */ spin_unlock(&tree->lock); zpool_unmap_handle(pool, handle); + kfree(tmp); return 0; } spin_unlock(&tree->lock); @@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) dlen = PAGE_SIZE; src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(pool, handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1033,7 +1048,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) spin_unlock(&tree->lock); end: - zpool_unmap_handle(pool, handle); + if (zpool_can_sleep_mapped(pool)) + zpool_unmap_handle(pool, handle); + else + kfree(tmp); + return ret; } @@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src, *dst; + u8 *src, *dst, *tmp; unsigned int dlen; int ret; @@ -1253,15 +1272,33 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, dst = kmap_atomic(page); zswap_fill_page(dst, entry->value); kunmap_atomic(dst); + ret = 0; goto freeentry; } + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + tmp = kmalloc(entry->length, GFP_ATOMIC); + if (!tmp) { + ret = -ENOMEM; + goto freeentry; + } + } + /* decompress */ dlen = PAGE_SIZE; src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); if (zpool_evictable(entry->pool->zpool)) src += sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(entry->pool->zpool, entry->handle); + } + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); @@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); mutex_unlock(acomp_ctx->mutex); - zpool_unmap_handle(entry->pool->zpool, entry->handle); + if (zpool_can_sleep_mapped(entry->pool->zpool)) + zpool_unmap_handle(entry->pool->zpool, entry->handle); + else + kfree(tmp); + BUG_ON(ret); freeentry: @@ -1279,7 +1320,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - return 0; + return ret; } /* frees an entry in zswap */ From e818e820c6a0e819d239264fc863531bbcd72c30 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Thu, 25 Feb 2021 17:18:22 -0800 Subject: [PATCH 436/506] mm: set the sleep_mapped to true for zbud and z3fold zpool driver adds a flag to indicate whether the zpool driver can enter an atomic context after mapping. This patch sets it true for z3fold and zbud. Link: https://lkml.kernel.org/r/1611035683-12732-3-git-send-email-tiantao6@hisilicon.com Signed-off-by: Tian Tao Reviewed-by: Vitaly Wool Acked-by: Sebastian Andrzej Siewior Reported-by: Mike Galbraith Cc: Seth Jennings Cc: Dan Streetman Cc: Barry Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 1 + mm/zbud.c | 1 + 2 files changed, 2 insertions(+) diff --git a/mm/z3fold.c b/mm/z3fold.c index c1ccf6bb0ffb..b5dafa7e44e4 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1771,6 +1771,7 @@ static u64 z3fold_zpool_total_size(void *pool) static struct zpool_driver z3fold_zpool_driver = { .type = "z3fold", + .sleep_mapped = true, .owner = THIS_MODULE, .create = z3fold_zpool_create, .destroy = z3fold_zpool_destroy, diff --git a/mm/zbud.c b/mm/zbud.c index c49966ece674..7ec5f27a68b0 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -203,6 +203,7 @@ static u64 zbud_zpool_total_size(void *pool) static struct zpool_driver zbud_zpool_driver = { .type = "zbud", + .sleep_mapped = true, .owner = THIS_MODULE, .create = zbud_zpool_create, .destroy = zbud_zpool_destroy, From f0231305acd53375c6cf736971bf5711105dd6bb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:27 -0800 Subject: [PATCH 437/506] mm/zsmalloc.c: convert to use kmem_cache_zalloc in cache_alloc_zspage() We always memset the zspage allocated via cache_alloc_zspage. So it's more convenient to use kmem_cache_zalloc in cache_alloc_zspage than caller do it manually. Link: https://lkml.kernel.org/r/20210114120032.25885-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7289f502ffac..cf0ed0e4e911 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -357,7 +357,7 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle) static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { - return kmem_cache_alloc(pool->zspage_cachep, + return kmem_cache_zalloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } @@ -1064,7 +1064,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zspage) return NULL; - memset(zspage, 0, sizeof(struct zspage)); zspage->magic = ZSPAGE_MAGIC; migrate_lock_init(zspage); From 2395928158059b8f9858365fce7713ce7fef62e4 Mon Sep 17 00:00:00 2001 From: Rokudo Yan Date: Thu, 25 Feb 2021 17:18:31 -0800 Subject: [PATCH 438/506] zsmalloc: account the number of compacted pages correctly There exists multiple path may do zram compaction concurrently. 1. auto-compaction triggered during memory reclaim 2. userspace utils write zram/compaction node So, multiple threads may call zs_shrinker_scan/zs_compact concurrently. But pages_compacted is a per zsmalloc pool variable and modification of the variable is not serialized(through under class->lock). There are two issues here: 1. the pages_compacted may not equal to total number of pages freed(due to concurrently add). 2. zs_shrinker_scan may not return the correct number of pages freed(issued by current shrinker). The fix is simple: 1. account the number of pages freed in zs_compact locally. 2. use actomic variable pages_compacted to accumulate total number. Link: https://lkml.kernel.org/r/20210202122235.26885-1-wu-yan@tcl.com Fixes: 860c707dca155a56 ("zsmalloc: account the number of compacted pages") Signed-off-by: Rokudo Yan Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 +- include/linux/zsmalloc.h | 2 +- mm/zsmalloc.c | 17 +++++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d7018543842e..a711a2e2a794 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1081,7 +1081,7 @@ static ssize_t mm_stat_show(struct device *dev, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted, + atomic_long_read(&pool_stats.pages_compacted), (u64)atomic64_read(&zram->stats.huge_pages), (u64)atomic64_read(&zram->stats.huge_pages_since)); up_read(&zram->init_lock); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 4807ca4d52e0..2a430e713ce5 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -35,7 +35,7 @@ enum zs_mapmode { struct zs_pool_stats { /* How many pages were migrated (freed) */ - unsigned long pages_compacted; + atomic_long_t pages_compacted; }; struct zs_pool; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cf0ed0e4e911..1518732f95c3 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2212,11 +2212,13 @@ static unsigned long zs_can_compact(struct size_class *class) return obj_wasted * class->pages_per_zspage; } -static void __zs_compact(struct zs_pool *pool, struct size_class *class) +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) { struct zs_compact_control cc; struct zspage *src_zspage; struct zspage *dst_zspage = NULL; + unsigned long pages_freed = 0; spin_lock(&class->lock); while ((src_zspage = isolate_zspage(class, true))) { @@ -2246,7 +2248,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) putback_zspage(class, dst_zspage); if (putback_zspage(class, src_zspage) == ZS_EMPTY) { free_zspage(pool, class, src_zspage); - pool->stats.pages_compacted += class->pages_per_zspage; + pages_freed += class->pages_per_zspage; } spin_unlock(&class->lock); cond_resched(); @@ -2257,12 +2259,15 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) putback_zspage(class, src_zspage); spin_unlock(&class->lock); + + return pages_freed; } unsigned long zs_compact(struct zs_pool *pool) { int i; struct size_class *class; + unsigned long pages_freed = 0; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; @@ -2270,10 +2275,11 @@ unsigned long zs_compact(struct zs_pool *pool) continue; if (class->index != i) continue; - __zs_compact(pool, class); + pages_freed += __zs_compact(pool, class); } + atomic_long_add(pages_freed, &pool->stats.pages_compacted); - return pool->stats.pages_compacted; + return pages_freed; } EXPORT_SYMBOL_GPL(zs_compact); @@ -2290,13 +2296,12 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - pages_freed = pool->stats.pages_compacted; /* * Compact classes and calculate compaction delta. * Can run concurrently with a manually triggered * (by user) compaction. */ - pages_freed = zs_compact(pool) - pages_freed; + pages_freed = zs_compact(pool); return pages_freed ? pages_freed : SHRINK_STOP; } From a6c5e0f75b3f7b8ace146f4eaa6398774d39a640 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:34 -0800 Subject: [PATCH 439/506] mm/zsmalloc.c: use page_private() to access page->private It's recommended to use helper macro page_private() to access the private field of page. Use such helper to eliminate direct access. Link: https://lkml.kernel.org/r/20210203091857.20017-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 1518732f95c3..30c358b72025 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -816,7 +816,7 @@ static int get_pages_per_zspage(int class_size) static struct zspage *get_zspage(struct page *page) { - struct zspage *zspage = (struct zspage *)page->private; + struct zspage *zspage = (struct zspage *)page_private(page); BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; From 4be408cec257d1156d35647db57726f5ef977630 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 25 Feb 2021 17:18:38 -0800 Subject: [PATCH 440/506] mm: page-flags.h: Typo fix (It -> If) The "If" was wrongly spelled as "It". Link: https://lkml.kernel.org/r/1608959036-91409-1-git-send-email-guoren@kernel.org Signed-off-by: Guo Ren Cc: Oscar Salvador Cc: Alexander Duyck Cc: David Hildenbrand Cc: Steven Price Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index db914477057b..04a34c08e0a6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -810,7 +810,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) /* * Flags checked when a page is freed. Pages being freed should not have - * these flags set. It they are, there is a problem. + * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ @@ -821,7 +821,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) /* * Flags checked when a page is prepped for return by the page allocator. - * Pages being prepped should not have these flags set. It they are set, + * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's From 0f2f89b6de32de49373040eb4ee9d6bc1930ae5a Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 25 Feb 2021 17:18:41 -0800 Subject: [PATCH 441/506] mm/dmapool: use might_alloc() Now that my little helper has landed, use it more. On top of the existing check this also uses lockdep through the fs_reclaim annotations. Link: https://lkml.kernel.org/r/20210113135009.3606813-1-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index a97c97232337..f3791532fef2 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -319,7 +320,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(gfpflags_allow_blocking(mem_flags)); + might_alloc(mem_flags); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { From c1ca59a1f21e360b26e26c187a4e42f22bb768d3 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 25 Feb 2021 17:18:45 -0800 Subject: [PATCH 442/506] mm/backing-dev.c: use might_alloc() Now that my little helper has landed, use it more. On top of the existing check this also uses lockdep through the fs_reclaim annotations. [akpm@linux-foundation.org: include linux/sched/mm.h] Link: https://lkml.kernel.org/r/20210113135009.3606813-2-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eca555f658d9..576220acd686 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -578,7 +579,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfpflags_allow_blocking(gfp)); + might_alloc(gfp); if (!memcg_css->parent) return &bdi->wb; From 87005394e14aa2f886581fb51e5e2022dc77ea05 Mon Sep 17 00:00:00 2001 From: Stephen Zhang Date: Thu, 25 Feb 2021 17:18:48 -0800 Subject: [PATCH 443/506] mm/early_ioremap.c: use __func__ instead of function name It is better to use __func__ instead of function name. Link: https://lkml.kernel.org/r/1611385587-4209-1-git-send-email-stephenzhangzsd@gmail.com Signed-off-by: Stephen Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/early_ioremap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index a0018ad1a1f6..164607c7cdf1 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -181,17 +181,17 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } } - if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", - addr, size)) + if (WARN(slot < 0, "%s(%p, %08lx) not found slot\n", + __func__, addr, size)) return; if (WARN(prev_size[slot] != size, - "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot])) + "%s(%p, %08lx) [%d] size not consistent %08lx\n", + __func__, addr, size, slot, prev_size[slot])) return; - WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", - addr, size, slot); + WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n", + __func__, addr, size, slot); virt_addr = (unsigned long)addr; if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) From 0ce20dd840897b12ae70869c69f1ba34d6d16965 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:18:53 -0800 Subject: [PATCH 444/506] mm: add Kernel Electric-Fence infrastructure Patch series "KFENCE: A low-overhead sampling-based memory safety error detector", v7. This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a low-overhead sampling-based memory safety error detector of heap use-after-free, invalid-free, and out-of-bounds access errors. This series enables KFENCE for the x86 and arm64 architectures, and adds KFENCE hooks to the SLAB and SLUB allocators. KFENCE is designed to be enabled in production kernels, and has near zero performance overhead. Compared to KASAN, KFENCE trades performance for precision. The main motivation behind KFENCE's design, is that with enough total uptime KFENCE will detect bugs in code paths not typically exercised by non-production test workloads. One way to quickly achieve a large enough total uptime is when the tool is deployed across a large fleet of machines. KFENCE objects each reside on a dedicated page, at either the left or right page boundaries. The pages to the left and right of the object page are "guard pages", whose attributes are changed to a protected state, and cause page faults on any attempted access to them. Such page faults are then intercepted by KFENCE, which handles the fault gracefully by reporting a memory access error. Guarded allocations are set up based on a sample interval (can be set via kfence.sample_interval). After expiration of the sample interval, the next allocation through the main allocator (SLAB or SLUB) returns a guarded allocation from the KFENCE object pool. At this point, the timer is reset, and the next allocation is set up after the expiration of the interval. To enable/disable a KFENCE allocation through the main allocator's fast-path without overhead, KFENCE relies on static branches via the static keys infrastructure. The static branch is toggled to redirect the allocation to KFENCE. The KFENCE memory pool is of fixed size, and if the pool is exhausted no further KFENCE allocations occur. The default config is conservative with only 255 objects, resulting in a pool size of 2 MiB (with 4 KiB pages). We have verified by running synthetic benchmarks (sysbench I/O, hackbench) and production server-workload benchmarks that a kernel with KFENCE (using sample intervals 100-500ms) is performance-neutral compared to a non-KFENCE baseline kernel. KFENCE is inspired by GWP-ASan [1], a userspace tool with similar properties. The name "KFENCE" is a homage to the Electric Fence Malloc Debugger [2]. For more details, see Documentation/dev-tools/kfence.rst added in the series -- also viewable here: https://raw.githubusercontent.com/google/kasan/kfence/Documentation/dev-tools/kfence.rst [1] http://llvm.org/docs/GwpAsan.html [2] https://linux.die.net/man/3/efence This patch (of 9): This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a low-overhead sampling-based memory safety error detector of heap use-after-free, invalid-free, and out-of-bounds access errors. KFENCE is designed to be enabled in production kernels, and has near zero performance overhead. Compared to KASAN, KFENCE trades performance for precision. The main motivation behind KFENCE's design, is that with enough total uptime KFENCE will detect bugs in code paths not typically exercised by non-production test workloads. One way to quickly achieve a large enough total uptime is when the tool is deployed across a large fleet of machines. KFENCE objects each reside on a dedicated page, at either the left or right page boundaries. The pages to the left and right of the object page are "guard pages", whose attributes are changed to a protected state, and cause page faults on any attempted access to them. Such page faults are then intercepted by KFENCE, which handles the fault gracefully by reporting a memory access error. To detect out-of-bounds writes to memory within the object's page itself, KFENCE also uses pattern-based redzones. The following figure illustrates the page layout: ---+-----------+-----------+-----------+-----------+-----------+--- | xxxxxxxxx | O : | xxxxxxxxx | : O | xxxxxxxxx | | xxxxxxxxx | B : | xxxxxxxxx | : B | xxxxxxxxx | | x GUARD x | J : RED- | x GUARD x | RED- : J | x GUARD x | | xxxxxxxxx | E : ZONE | xxxxxxxxx | ZONE : E | xxxxxxxxx | | xxxxxxxxx | C : | xxxxxxxxx | : C | xxxxxxxxx | | xxxxxxxxx | T : | xxxxxxxxx | : T | xxxxxxxxx | ---+-----------+-----------+-----------+-----------+-----------+--- Guarded allocations are set up based on a sample interval (can be set via kfence.sample_interval). After expiration of the sample interval, a guarded allocation from the KFENCE object pool is returned to the main allocator (SLAB or SLUB). At this point, the timer is reset, and the next allocation is set up after the expiration of the interval. To enable/disable a KFENCE allocation through the main allocator's fast-path without overhead, KFENCE relies on static branches via the static keys infrastructure. The static branch is toggled to redirect the allocation to KFENCE. To date, we have verified by running synthetic benchmarks (sysbench I/O, hackbench) that a kernel compiled with KFENCE is performance-neutral compared to the non-KFENCE baseline. For more details, see Documentation/dev-tools/kfence.rst (added later in the series). [elver@google.com: fix parameter description for kfence_object_start()] Link: https://lkml.kernel.org/r/20201106092149.GA2851373@elver.google.com [elver@google.com: avoid stalling work queue task without allocations] Link: https://lkml.kernel.org/r/CADYN=9J0DQhizAGB0-jz4HOBBh+05kMBXb4c0cXMS7Qi5NAJiw@mail.gmail.com Link: https://lkml.kernel.org/r/20201110135320.3309507-1-elver@google.com [elver@google.com: fix potential deadlock due to wake_up()] Link: https://lkml.kernel.org/r/000000000000c0645805b7f982e4@google.com Link: https://lkml.kernel.org/r/20210104130749.1768991-1-elver@google.com [elver@google.com: add option to use KFENCE without static keys] Link: https://lkml.kernel.org/r/20210111091544.3287013-1-elver@google.com [elver@google.com: add missing copyright and description headers] Link: https://lkml.kernel.org/r/20210118092159.145934-1-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-2-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Reviewed-by: SeongJae Park Co-developed-by: Marco Elver Reviewed-by: Jann Horn Cc: "H. Peter Anvin" Cc: Paul E. McKenney Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Joern Engel Cc: Kees Cook Cc: Mark Rutland Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 216 +++++++++++ init/main.c | 3 + lib/Kconfig.debug | 1 + lib/Kconfig.kfence | 67 ++++ mm/Makefile | 1 + mm/kfence/Makefile | 3 + mm/kfence/core.c | 840 +++++++++++++++++++++++++++++++++++++++++ mm/kfence/kfence.h | 113 ++++++ mm/kfence/report.c | 240 ++++++++++++ 9 files changed, 1484 insertions(+) create mode 100644 include/linux/kfence.h create mode 100644 lib/Kconfig.kfence create mode 100644 mm/kfence/Makefile create mode 100644 mm/kfence/core.c create mode 100644 mm/kfence/kfence.h create mode 100644 mm/kfence/report.c diff --git a/include/linux/kfence.h b/include/linux/kfence.h new file mode 100644 index 000000000000..81f3911cb298 --- /dev/null +++ b/include/linux/kfence.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel Electric-Fence (KFENCE). Public interface for allocator and fault + * handler integration. For more info see Documentation/dev-tools/kfence.rst. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef _LINUX_KFENCE_H +#define _LINUX_KFENCE_H + +#include +#include + +#ifdef CONFIG_KFENCE + +/* + * We allocate an even number of pages, as it simplifies calculations to map + * address to metadata indices; effectively, the very first page serves as an + * extended guard page, but otherwise has no special purpose. + */ +#define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE) +extern char *__kfence_pool; + +#ifdef CONFIG_KFENCE_STATIC_KEYS +#include +DECLARE_STATIC_KEY_FALSE(kfence_allocation_key); +#else +#include +extern atomic_t kfence_allocation_gate; +#endif + +/** + * is_kfence_address() - check if an address belongs to KFENCE pool + * @addr: address to check + * + * Return: true or false depending on whether the address is within the KFENCE + * object range. + * + * KFENCE objects live in a separate page range and are not to be intermixed + * with regular heap objects (e.g. KFENCE objects must never be added to the + * allocator freelists). Failing to do so may and will result in heap + * corruptions, therefore is_kfence_address() must be used to check whether + * an object requires specific handling. + * + * Note: This function may be used in fast-paths, and is performance critical. + * Future changes should take this into account; for instance, we want to avoid + * introducing another load and therefore need to keep KFENCE_POOL_SIZE a + * constant (until immediate patching support is added to the kernel). + */ +static __always_inline bool is_kfence_address(const void *addr) +{ + /* + * The non-NULL check is required in case the __kfence_pool pointer was + * never initialized; keep it in the slow-path after the range-check. + */ + return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr); +} + +/** + * kfence_alloc_pool() - allocate the KFENCE pool via memblock + */ +void __init kfence_alloc_pool(void); + +/** + * kfence_init() - perform KFENCE initialization at boot time + * + * Requires that kfence_alloc_pool() was called before. This sets up the + * allocation gate timer, and requires that workqueues are available. + */ +void __init kfence_init(void); + +/** + * kfence_shutdown_cache() - handle shutdown_cache() for KFENCE objects + * @s: cache being shut down + * + * Before shutting down a cache, one must ensure there are no remaining objects + * allocated from it. Because KFENCE objects are not referenced from the cache + * directly, we need to check them here. + * + * Note that shutdown_cache() is internal to SL*B, and kmem_cache_destroy() does + * not return if allocated objects still exist: it prints an error message and + * simply aborts destruction of a cache, leaking memory. + * + * If the only such objects are KFENCE objects, we will not leak the entire + * cache, but instead try to provide more useful debug info by making allocated + * objects "zombie allocations". Objects may then still be used or freed (which + * is handled gracefully), but usage will result in showing KFENCE error reports + * which include stack traces to the user of the object, the original allocation + * site, and caller to shutdown_cache(). + */ +void kfence_shutdown_cache(struct kmem_cache *s); + +/* + * Allocate a KFENCE object. Allocators must not call this function directly, + * use kfence_alloc() instead. + */ +void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags); + +/** + * kfence_alloc() - allocate a KFENCE object with a low probability + * @s: struct kmem_cache with object requirements + * @size: exact size of the object to allocate (can be less than @s->size + * e.g. for kmalloc caches) + * @flags: GFP flags + * + * Return: + * * NULL - must proceed with allocating as usual, + * * non-NULL - pointer to a KFENCE object. + * + * kfence_alloc() should be inserted into the heap allocation fast path, + * allowing it to transparently return KFENCE-allocated objects with a low + * probability using a static branch (the probability is controlled by the + * kfence.sample_interval boot parameter). + */ +static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) +{ +#ifdef CONFIG_KFENCE_STATIC_KEYS + if (static_branch_unlikely(&kfence_allocation_key)) +#else + if (unlikely(!atomic_read(&kfence_allocation_gate))) +#endif + return __kfence_alloc(s, size, flags); + return NULL; +} + +/** + * kfence_ksize() - get actual amount of memory allocated for a KFENCE object + * @addr: pointer to a heap object + * + * Return: + * * 0 - not a KFENCE object, must call __ksize() instead, + * * non-0 - this many bytes can be accessed without causing a memory error. + * + * kfence_ksize() returns the number of bytes requested for a KFENCE object at + * allocation time. This number may be less than the object size of the + * corresponding struct kmem_cache. + */ +size_t kfence_ksize(const void *addr); + +/** + * kfence_object_start() - find the beginning of a KFENCE object + * @addr: address within a KFENCE-allocated object + * + * Return: address of the beginning of the object. + * + * SL[AU]B-allocated objects are laid out within a page one by one, so it is + * easy to calculate the beginning of an object given a pointer inside it and + * the object size. The same is not true for KFENCE, which places a single + * object at either end of the page. This helper function is used to find the + * beginning of a KFENCE-allocated object. + */ +void *kfence_object_start(const void *addr); + +/** + * __kfence_free() - release a KFENCE heap object to KFENCE pool + * @addr: object to be freed + * + * Requires: is_kfence_address(addr) + * + * Release a KFENCE object and mark it as freed. + */ +void __kfence_free(void *addr); + +/** + * kfence_free() - try to release an arbitrary heap object to KFENCE pool + * @addr: object to be freed + * + * Return: + * * false - object doesn't belong to KFENCE pool and was ignored, + * * true - object was released to KFENCE pool. + * + * Release a KFENCE object and mark it as freed. May be called on any object, + * even non-KFENCE objects, to simplify integration of the hooks into the + * allocator's free codepath. The allocator must check the return value to + * determine if it was a KFENCE object or not. + */ +static __always_inline __must_check bool kfence_free(void *addr) +{ + if (!is_kfence_address(addr)) + return false; + __kfence_free(addr); + return true; +} + +/** + * kfence_handle_page_fault() - perform page fault handling for KFENCE pages + * @addr: faulting address + * + * Return: + * * false - address outside KFENCE pool, + * * true - page fault handled by KFENCE, no additional handling required. + * + * A page fault inside KFENCE pool indicates a memory error, such as an + * out-of-bounds access, a use-after-free or an invalid memory access. In these + * cases KFENCE prints an error message and marks the offending page as + * present, so that the kernel can proceed. + */ +bool __must_check kfence_handle_page_fault(unsigned long addr); + +#else /* CONFIG_KFENCE */ + +static inline bool is_kfence_address(const void *addr) { return false; } +static inline void kfence_alloc_pool(void) { } +static inline void kfence_init(void) { } +static inline void kfence_shutdown_cache(struct kmem_cache *s) { } +static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; } +static inline size_t kfence_ksize(const void *addr) { return 0; } +static inline void *kfence_object_start(const void *addr) { return NULL; } +static inline void __kfence_free(void *addr) { } +static inline bool __must_check kfence_free(void *addr) { return false; } +static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; } + +#endif + +#endif /* _LINUX_KFENCE_H */ diff --git a/init/main.c b/init/main.c index e9933cbf60d4..261051070e3c 100644 --- a/init/main.c +++ b/init/main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,7 @@ static void __init mm_init(void) */ page_ext_init_flatmem(); init_mem_debugging_and_hardening(); + kfence_alloc_pool(); report_meminit(); mem_init(); /* page_owner must be initialized after buddy is ready */ @@ -955,6 +957,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) hrtimers_init(); softirq_init(); timekeeping_init(); + kfence_init(); /* * For best initial stack canary entropy, prepare it after: diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f9febffffc21..2779c29d9981 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -938,6 +938,7 @@ config DEBUG_STACKOVERFLOW If in doubt, say "N". source "lib/Kconfig.kasan" +source "lib/Kconfig.kfence" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence new file mode 100644 index 000000000000..b88ac9d6b2e6 --- /dev/null +++ b/lib/Kconfig.kfence @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config HAVE_ARCH_KFENCE + bool + +menuconfig KFENCE + bool "KFENCE: low-overhead sampling-based memory safety error detector" + depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB) + select STACKTRACE + help + KFENCE is a low-overhead sampling-based detector of heap out-of-bounds + access, use-after-free, and invalid-free errors. KFENCE is designed + to have negligible cost to permit enabling it in production + environments. + + Note that, KFENCE is not a substitute for explicit testing with tools + such as KASAN. KFENCE can detect a subset of bugs that KASAN can + detect, albeit at very different performance profiles. If you can + afford to use KASAN, continue using KASAN, for example in test + environments. If your kernel targets production use, and cannot + enable KASAN due to its cost, consider using KFENCE. + +if KFENCE + +config KFENCE_STATIC_KEYS + bool "Use static keys to set up allocations" + default y + depends on JUMP_LABEL # To ensure performance, require jump labels + help + Use static keys (static branches) to set up KFENCE allocations. Using + static keys is normally recommended, because it avoids a dynamic + branch in the allocator's fast path. However, with very low sample + intervals, or on systems that do not support jump labels, a dynamic + branch may still be an acceptable performance trade-off. + +config KFENCE_SAMPLE_INTERVAL + int "Default sample interval in milliseconds" + default 100 + help + The KFENCE sample interval determines the frequency with which heap + allocations will be guarded by KFENCE. May be overridden via boot + parameter "kfence.sample_interval". + + Set this to 0 to disable KFENCE by default, in which case only + setting "kfence.sample_interval" to a non-zero value enables KFENCE. + +config KFENCE_NUM_OBJECTS + int "Number of guarded objects available" + range 1 65535 + default 255 + help + The number of guarded objects available. For each KFENCE object, 2 + pages are required; with one containing the object and two adjacent + ones used as guard pages. + +config KFENCE_STRESS_TEST_FAULTS + int "Stress testing of fault handling and error reporting" if EXPERT + default 0 + help + The inverse probability with which to randomly protect KFENCE object + pages, resulting in spurious use-after-frees. The main purpose of + this option is to stress test KFENCE with concurrent error reports + and allocations/frees. A value of 0 disables stress testing logic. + + Only for KFENCE testing; set to 0 if you are not a KFENCE developer. + +endif # KFENCE diff --git a/mm/Makefile b/mm/Makefile index 135bbb65511a..72227b24a616 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ +obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile new file mode 100644 index 000000000000..d991e9a349f0 --- /dev/null +++ b/mm/kfence/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KFENCE) := core.o report.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c new file mode 100644 index 000000000000..d6a32c13336b --- /dev/null +++ b/mm/kfence/core.c @@ -0,0 +1,840 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE guarded object allocator and fault handling. + * + * Copyright (C) 2020, Google LLC. + */ + +#define pr_fmt(fmt) "kfence: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kfence.h" + +/* Disables KFENCE on the first warning assuming an irrecoverable error. */ +#define KFENCE_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) \ + WRITE_ONCE(kfence_enabled, false); \ + __cond; \ + }) + +/* === Data ================================================================= */ + +static bool kfence_enabled __read_mostly; + +static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kfence." + +static int param_set_sample_interval(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret = kstrtoul(val, 0, &num); + + if (ret < 0) + return ret; + + if (!num) /* Using 0 to indicate KFENCE is disabled. */ + WRITE_ONCE(kfence_enabled, false); + else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) + return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */ + + *((unsigned long *)kp->arg) = num; + return 0; +} + +static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) +{ + if (!READ_ONCE(kfence_enabled)) + return sprintf(buffer, "0\n"); + + return param_get_ulong(buffer, kp); +} + +static const struct kernel_param_ops sample_interval_param_ops = { + .set = param_set_sample_interval, + .get = param_get_sample_interval, +}; +module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); + +/* The pool of pages used for guard pages and objects. */ +char *__kfence_pool __ro_after_init; +EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ + +/* + * Per-object metadata, with one-to-one mapping of object metadata to + * backing pages (in __kfence_pool). + */ +static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); +struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +/* Freelist with available objects. */ +static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); +static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ + +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* The static key to set up a KFENCE allocation. */ +DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); +#endif + +/* Gates the allocation, ensuring only one succeeds in a given period. */ +atomic_t kfence_allocation_gate = ATOMIC_INIT(1); + +/* Statistics counters for debugfs. */ +enum kfence_counter_id { + KFENCE_COUNTER_ALLOCATED, + KFENCE_COUNTER_ALLOCS, + KFENCE_COUNTER_FREES, + KFENCE_COUNTER_ZOMBIES, + KFENCE_COUNTER_BUGS, + KFENCE_COUNTER_COUNT, +}; +static atomic_long_t counters[KFENCE_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KFENCE_COUNTER_ALLOCATED] = "currently allocated", + [KFENCE_COUNTER_ALLOCS] = "total allocations", + [KFENCE_COUNTER_FREES] = "total frees", + [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", + [KFENCE_COUNTER_BUGS] = "total bugs", +}; +static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); + +/* === Internals ============================================================ */ + +static bool kfence_protect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); +} + +static bool kfence_unprotect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); +} + +static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) +{ + long index; + + /* The checks do not affect performance; only called from slow-paths. */ + + if (!is_kfence_address((void *)addr)) + return NULL; + + /* + * May be an invalid index if called with an address at the edge of + * __kfence_pool, in which case we would report an "invalid access" + * error. + */ + index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; + if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) + return NULL; + + return &kfence_metadata[index]; +} + +static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) +{ + unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; + unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; + + /* The checks do not affect performance; only called from slow-paths. */ + + /* Only call with a pointer into kfence_metadata. */ + if (KFENCE_WARN_ON(meta < kfence_metadata || + meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) + return 0; + + /* + * This metadata object only ever maps to 1 page; verify that the stored + * address is in the expected range. + */ + if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) + return 0; + + return pageaddr; +} + +/* + * Update the object's metadata state, including updating the alloc/free stacks + * depending on the state transition. + */ +static noinline void metadata_update_state(struct kfence_metadata *meta, + enum kfence_object_state next) +{ + struct kfence_track *track = + next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; + + lockdep_assert_held(&meta->lock); + + /* + * Skip over 1 (this) functions; noinline ensures we do not accidentally + * skip over the caller by never inlining. + */ + track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + track->pid = task_pid_nr(current); + + /* + * Pairs with READ_ONCE() in + * kfence_shutdown_cache(), + * kfence_handle_page_fault(). + */ + WRITE_ONCE(meta->state, next); +} + +/* Write canary byte to @addr. */ +static inline bool set_canary_byte(u8 *addr) +{ + *addr = KFENCE_CANARY_PATTERN(addr); + return true; +} + +/* Check canary byte at @addr. */ +static inline bool check_canary_byte(u8 *addr) +{ + if (likely(*addr == KFENCE_CANARY_PATTERN(addr))) + return true; + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr), + KFENCE_ERROR_CORRUPTION); + return false; +} + +/* __always_inline this to ensure we won't do an indirect call to fn. */ +static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *)) +{ + const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); + unsigned long addr; + + lockdep_assert_held(&meta->lock); + + /* + * We'll iterate over each canary byte per-side until fn() returns + * false. However, we'll still iterate over the canary bytes to the + * right of the object even if there was an error in the canary bytes to + * the left of the object. Specifically, if check_canary_byte() + * generates an error, showing both sides might give more clues as to + * what the error is about when displaying which bytes were corrupted. + */ + + /* Apply to left of object. */ + for (addr = pageaddr; addr < meta->addr; addr++) { + if (!fn((u8 *)addr)) + break; + } + + /* Apply to right of object. */ + for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) { + if (!fn((u8 *)addr)) + break; + } +} + +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp) +{ + struct kfence_metadata *meta = NULL; + unsigned long flags; + struct page *page; + void *addr; + + /* Try to obtain a free object. */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + if (!list_empty(&kfence_freelist)) { + meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); + list_del_init(&meta->list); + } + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + if (!meta) + return NULL; + + if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { + /* + * This is extremely unlikely -- we are reporting on a + * use-after-free, which locked meta->lock, and the reporting + * code via printk calls kmalloc() which ends up in + * kfence_alloc() and tries to grab the same object that we're + * reporting on. While it has never been observed, lockdep does + * report that there is a possibility of deadlock. Fix it by + * using trylock and bailing out gracefully. + */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + /* Put the object back on the freelist. */ + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + return NULL; + } + + meta->addr = metadata_to_pageaddr(meta); + /* Unprotect if we're reusing this page. */ + if (meta->state == KFENCE_OBJECT_FREED) + kfence_unprotect(meta->addr); + + /* + * Note: for allocations made before RNG initialization, will always + * return zero. We still benefit from enabling KFENCE as early as + * possible, even when the RNG is not yet available, as this will allow + * KFENCE to detect bugs due to earlier allocations. The only downside + * is that the out-of-bounds accesses detected are deterministic for + * such allocations. + */ + if (prandom_u32_max(2)) { + /* Allocate on the "right" side, re-calculate address. */ + meta->addr += PAGE_SIZE - size; + meta->addr = ALIGN_DOWN(meta->addr, cache->align); + } + + addr = (void *)meta->addr; + + /* Update remaining metadata. */ + metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED); + /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ + WRITE_ONCE(meta->cache, cache); + meta->size = size; + for_each_canary(meta, set_canary_byte); + + /* Set required struct page fields. */ + page = virt_to_page(meta->addr); + page->slab_cache = cache; + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + /* Memory initialization. */ + + /* + * We check slab_want_init_on_alloc() ourselves, rather than letting + * SL*B do the initialization, as otherwise we might overwrite KFENCE's + * redzone. + */ + if (unlikely(slab_want_init_on_alloc(gfp, cache))) + memzero_explicit(addr, size); + if (cache->ctor) + cache->ctor(addr); + + if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS)) + kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); + + return addr; +} + +static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) +{ + struct kcsan_scoped_access assert_page_exclusive; + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + + if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { + /* Invalid or double-free, bail out. */ + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE); + raw_spin_unlock_irqrestore(&meta->lock, flags); + return; + } + + /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */ + kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, + KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, + &assert_page_exclusive); + + if (CONFIG_KFENCE_STRESS_TEST_FAULTS) + kfence_unprotect((unsigned long)addr); /* To check canary bytes. */ + + /* Restore page protection if there was an OOB access. */ + if (meta->unprotected_page) { + kfence_protect(meta->unprotected_page); + meta->unprotected_page = 0; + } + + /* Check canary bytes for memory corruption. */ + for_each_canary(meta, check_canary_byte); + + /* + * Clear memory if init-on-free is set. While we protect the page, the + * data is still there, and after a use-after-free is detected, we + * unprotect the page, so the data is still accessible. + */ + if (!zombie && unlikely(slab_want_init_on_free(meta->cache))) + memzero_explicit(addr, meta->size); + + /* Mark the object as freed. */ + metadata_update_state(meta, KFENCE_OBJECT_FREED); + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + /* Protect to detect use-after-frees. */ + kfence_protect((unsigned long)addr); + + kcsan_end_scoped_access(&assert_page_exclusive); + if (!zombie) { + /* Add it to the tail of the freelist for reuse. */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + KFENCE_WARN_ON(!list_empty(&meta->list)); + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); + } else { + /* See kfence_shutdown_cache(). */ + atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); + } +} + +static void rcu_guarded_free(struct rcu_head *h) +{ + struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); + + kfence_guarded_free((void *)meta->addr, meta, false); +} + +static bool __init kfence_init_pool(void) +{ + unsigned long addr = (unsigned long)__kfence_pool; + struct page *pages; + int i; + + if (!__kfence_pool) + return false; + + if (!arch_kfence_init_pool()) + goto err; + + pages = virt_to_page(addr); + + /* + * Set up object pages: they must have PG_slab set, to avoid freeing + * these as real pages. + * + * We also want to avoid inserting kfence_free() in the kfree() + * fast-path in SLUB, and therefore need to ensure kfree() correctly + * enters __slab_free() slow-path. + */ + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + if (!i || (i % 2)) + continue; + + /* Verify we do not have a compound head page. */ + if (WARN_ON(compound_head(&pages[i]) != &pages[i])) + goto err; + + __SetPageSlab(&pages[i]); + } + + /* + * Protect the first 2 pages. The first page is mostly unnecessary, and + * merely serves as an extended guard page. However, adding one + * additional page in the beginning gives us an even number of pages, + * which simplifies the mapping of address to metadata index. + */ + for (i = 0; i < 2; i++) { + if (unlikely(!kfence_protect(addr))) + goto err; + + addr += PAGE_SIZE; + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta = &kfence_metadata[i]; + + /* Initialize metadata. */ + INIT_LIST_HEAD(&meta->list); + raw_spin_lock_init(&meta->lock); + meta->state = KFENCE_OBJECT_UNUSED; + meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ + list_add_tail(&meta->list, &kfence_freelist); + + /* Protect the right redzone. */ + if (unlikely(!kfence_protect(addr + PAGE_SIZE))) + goto err; + + addr += 2 * PAGE_SIZE; + } + + return true; + +err: + /* + * Only release unprotected pages, and do not try to go back and change + * page attributes due to risk of failing to do so as well. If changing + * page attributes for some pages fails, it is very likely that it also + * fails for the first page, and therefore expect addr==__kfence_pool in + * most failure cases. + */ + memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); + __kfence_pool = NULL; + return false; +} + +/* === DebugFS Interface ==================================================== */ + +static int stats_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); + for (i = 0; i < KFENCE_COUNTER_COUNT; i++) + seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(stats); + +/* + * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. + * start_object() and next_object() return the object index + 1, because NULL is used + * to stop iteration. + */ +static void *start_object(struct seq_file *seq, loff_t *pos) +{ + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static void stop_object(struct seq_file *seq, void *v) +{ +} + +static void *next_object(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static int show_object(struct seq_file *seq, void *v) +{ + struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + kfence_print_object(seq, meta); + raw_spin_unlock_irqrestore(&meta->lock, flags); + seq_puts(seq, "---------------------------------\n"); + + return 0; +} + +static const struct seq_operations object_seqops = { + .start = start_object, + .next = next_object, + .stop = stop_object, + .show = show_object, +}; + +static int open_objects(struct inode *inode, struct file *file) +{ + return seq_open(file, &object_seqops); +} + +static const struct file_operations objects_fops = { + .open = open_objects, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int __init kfence_debugfs_init(void) +{ + struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL); + + debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); + debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); + return 0; +} + +late_initcall(kfence_debugfs_init); + +/* === Allocation Gate Timer ================================================ */ + +/* + * Set up delayed work, which will enable and disable the static key. We need to + * use a work queue (rather than a simple timer), since enabling and disabling a + * static key cannot be done from an interrupt. + * + * Note: Toggling a static branch currently causes IPIs, and here we'll end up + * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with + * more aggressive sampling intervals), we could get away with a variant that + * avoids IPIs, at the cost of not immediately capturing allocations if the + * instructions remain cached. + */ +static struct delayed_work kfence_timer; +static void toggle_allocation_gate(struct work_struct *work) +{ + if (!READ_ONCE(kfence_enabled)) + return; + + /* Enable static key, and await allocation to happen. */ + atomic_set(&kfence_allocation_gate, 0); +#ifdef CONFIG_KFENCE_STATIC_KEYS + static_branch_enable(&kfence_allocation_key); + /* + * Await an allocation. Timeout after 1 second, in case the kernel stops + * doing allocations, to avoid stalling this worker task for too long. + */ + { + unsigned long end_wait = jiffies + HZ; + + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&kfence_allocation_gate) != 0) + break; + schedule_timeout(1); + } while (time_before(jiffies, end_wait)); + __set_current_state(TASK_RUNNING); + } + /* Disable static key and reset timer. */ + static_branch_disable(&kfence_allocation_key); +#endif + schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval)); +} +static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); + +/* === Public interface ===================================================== */ + +void __init kfence_alloc_pool(void) +{ + if (!kfence_sample_interval) + return; + + __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + + if (!__kfence_pool) + pr_err("failed to allocate pool\n"); +} + +void __init kfence_init(void) +{ + /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ + if (!kfence_sample_interval) + return; + + if (!kfence_init_pool()) { + pr_err("%s failed\n", __func__); + return; + } + + WRITE_ONCE(kfence_enabled, true); + schedule_delayed_work(&kfence_timer, 0); + pr_info("initialized - using %lu bytes for %d objects", KFENCE_POOL_SIZE, + CONFIG_KFENCE_NUM_OBJECTS); + if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + pr_cont(" at 0x%px-0x%px\n", (void *)__kfence_pool, + (void *)(__kfence_pool + KFENCE_POOL_SIZE)); + else + pr_cont("\n"); +} + +void kfence_shutdown_cache(struct kmem_cache *s) +{ + unsigned long flags; + struct kfence_metadata *meta; + int i; + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + bool in_use; + + meta = &kfence_metadata[i]; + + /* + * If we observe some inconsistent cache and state pair where we + * should have returned false here, cache destruction is racing + * with either kmem_cache_alloc() or kmem_cache_free(). Taking + * the lock will not help, as different critical section + * serialization will have the same outcome. + */ + if (READ_ONCE(meta->cache) != s || + READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; + raw_spin_unlock_irqrestore(&meta->lock, flags); + + if (in_use) { + /* + * This cache still has allocations, and we should not + * release them back into the freelist so they can still + * safely be used and retain the kernel's default + * behaviour of keeping the allocations alive (leak the + * cache); however, they effectively become "zombie + * allocations" as the KFENCE objects are the only ones + * still in use and the owning cache is being destroyed. + * + * We mark them freed, so that any subsequent use shows + * more useful error messages that will include stack + * traces of the user of the object, the original + * allocation, and caller to shutdown_cache(). + */ + kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); + } + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + meta = &kfence_metadata[i]; + + /* See above. */ + if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) + meta->cache = NULL; + raw_spin_unlock_irqrestore(&meta->lock, flags); + } +} + +void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) +{ + /* + * allocation_gate only needs to become non-zero, so it doesn't make + * sense to continue writing to it and pay the associated contention + * cost, in case we have a large number of concurrent allocations. + */ + if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) + return NULL; + + if (!READ_ONCE(kfence_enabled)) + return NULL; + + if (size > PAGE_SIZE) + return NULL; + + return kfence_guarded_alloc(s, size, flags); +} + +size_t kfence_ksize(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? meta->size : 0; +} + +void *kfence_object_start(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? (void *)meta->addr : NULL; +} + +void __kfence_free(void *addr) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing + * the object, as the object page may be recycled for other-typed + * objects once it has been freed. meta->cache may be NULL if the cache + * was destroyed. + */ + if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) + call_rcu(&meta->rcu_head, rcu_guarded_free); + else + kfence_guarded_free(addr, meta, false); +} + +bool kfence_handle_page_fault(unsigned long addr) +{ + const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; + struct kfence_metadata *to_report = NULL; + enum kfence_error_type error_type; + unsigned long flags; + + if (!is_kfence_address((void *)addr)) + return false; + + if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ + return kfence_unprotect(addr); /* ... unprotect and proceed. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + + if (page_index % 2) { + /* This is a redzone, report a buffer overflow. */ + struct kfence_metadata *meta; + int distance = 0; + + meta = addr_to_metadata(addr - PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + to_report = meta; + /* Data race ok; distance calculation approximate. */ + distance = addr - data_race(meta->addr + meta->size); + } + + meta = addr_to_metadata(addr + PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + /* Data race ok; distance calculation approximate. */ + if (!to_report || distance > data_race(meta->addr) - addr) + to_report = meta; + } + + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + to_report->unprotected_page = addr; + error_type = KFENCE_ERROR_OOB; + + /* + * If the object was freed before we took the look we can still + * report this as an OOB -- the report will simply show the + * stacktrace of the free as well. + */ + } else { + to_report = addr_to_metadata(addr); + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + error_type = KFENCE_ERROR_UAF; + /* + * We may race with __kfence_alloc(), and it is possible that a + * freed object may be reallocated. We simply report this as a + * use-after-free, with the stack trace showing the place where + * the object was re-allocated. + */ + } + +out: + if (to_report) { + kfence_report_error(addr, to_report, error_type); + raw_spin_unlock_irqrestore(&to_report->lock, flags); + } else { + /* This may be a UAF or OOB access, but we can't be sure. */ + kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID); + } + + return kfence_unprotect(addr); /* Unprotect and let access proceed. */ +} diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h new file mode 100644 index 000000000000..1014060f9707 --- /dev/null +++ b/mm/kfence/kfence.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel Electric-Fence (KFENCE). For more info please see + * Documentation/dev-tools/kfence.rst. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef MM_KFENCE_KFENCE_H +#define MM_KFENCE_KFENCE_H + +#include +#include +#include +#include + +#include "../slab.h" /* for struct kmem_cache */ + +/* For non-debug builds, avoid leaking kernel pointers into dmesg. */ +#ifdef CONFIG_DEBUG_KERNEL +#define PTR_FMT "%px" +#else +#define PTR_FMT "%p" +#endif + +/* + * Get the canary byte pattern for @addr. Use a pattern that varies based on the + * lower 3 bits of the address, to detect memory corruptions with higher + * probability, where similar constants are used. + */ +#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7)) + +/* Maximum stack depth for reports. */ +#define KFENCE_STACK_DEPTH 64 + +/* KFENCE object states. */ +enum kfence_object_state { + KFENCE_OBJECT_UNUSED, /* Object is unused. */ + KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ + KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ +}; + +/* Alloc/free tracking information. */ +struct kfence_track { + pid_t pid; + int num_stack_entries; + unsigned long stack_entries[KFENCE_STACK_DEPTH]; +}; + +/* KFENCE metadata per guarded allocation. */ +struct kfence_metadata { + struct list_head list; /* Freelist node; access under kfence_freelist_lock. */ + struct rcu_head rcu_head; /* For delayed freeing. */ + + /* + * Lock protecting below data; to ensure consistency of the below data, + * since the following may execute concurrently: __kfence_alloc(), + * __kfence_free(), kfence_handle_page_fault(). However, note that we + * cannot grab the same metadata off the freelist twice, and multiple + * __kfence_alloc() cannot run concurrently on the same metadata. + */ + raw_spinlock_t lock; + + /* The current state of the object; see above. */ + enum kfence_object_state state; + + /* + * Allocated object address; cannot be calculated from size, because of + * alignment requirements. + * + * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant. + */ + unsigned long addr; + + /* + * The size of the original allocation. + */ + size_t size; + + /* + * The kmem_cache cache of the last allocation; NULL if never allocated + * or the cache has already been destroyed. + */ + struct kmem_cache *cache; + + /* + * In case of an invalid access, the page that was unprotected; we + * optimistically only store one address. + */ + unsigned long unprotected_page; + + /* Allocation and free stack information. */ + struct kfence_track alloc_track; + struct kfence_track free_track; +}; + +extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +/* KFENCE error types for report generation. */ +enum kfence_error_type { + KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */ + KFENCE_ERROR_UAF, /* Detected a use-after-free access. */ + KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */ + KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */ + KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ +}; + +void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, + enum kfence_error_type type); + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); + +#endif /* MM_KFENCE_KFENCE_H */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c new file mode 100644 index 000000000000..64f27c8d46a3 --- /dev/null +++ b/mm/kfence/report.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE reporting. + * + * Copyright (C) 2020, Google LLC. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "kfence.h" + +/* Helper function to either print to a seq_file or to console. */ +__printf(2, 3) +static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + if (seq) + seq_vprintf(seq, fmt, args); + else + vprintk(fmt, args); + va_end(args); +} + +/* + * Get the number of stack entries to skip to get out of MM internals. @type is + * optional, and if set to NULL, assumes an allocation or free stack. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries, + const enum kfence_error_type *type) +{ + char buf[64]; + int skipnr, fallback = 0; + bool is_access_fault = false; + + if (type) { + /* Depending on error type, find different stack entries. */ + switch (*type) { + case KFENCE_ERROR_UAF: + case KFENCE_ERROR_OOB: + case KFENCE_ERROR_INVALID: + is_access_fault = true; + break; + case KFENCE_ERROR_CORRUPTION: + case KFENCE_ERROR_INVALID_FREE: + break; + } + } + + for (skipnr = 0; skipnr < num_entries; skipnr++) { + int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); + + if (is_access_fault) { + if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len)) + goto found; + } else { + if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || + !strncmp(buf, "__slab_free", len)) { + /* + * In case of tail calls from any of the below + * to any of the above. + */ + fallback = skipnr + 1; + } + + /* Also the *_bulk() variants by only checking prefixes. */ + if (str_has_prefix(buf, "kfree") || + str_has_prefix(buf, "kmem_cache_free") || + str_has_prefix(buf, "__kmalloc") || + str_has_prefix(buf, "kmem_cache_alloc")) + goto found; + } + } + if (fallback < num_entries) + return fallback; +found: + skipnr++; + return skipnr < num_entries ? skipnr : 0; +} + +static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta, + bool show_alloc) +{ + const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; + + if (track->num_stack_entries) { + /* Skip allocation/free internals stack. */ + int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + + /* stack_trace_seq_print() does not exist; open code our own. */ + for (; i < track->num_stack_entries; i++) + seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]); + } else { + seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation"); + } +} + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta) +{ + const int size = abs(meta->size); + const unsigned long start = meta->addr; + const struct kmem_cache *const cache = meta->cache; + + lockdep_assert_held(&meta->lock); + + if (meta->state == KFENCE_OBJECT_UNUSED) { + seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata); + return; + } + + seq_con_printf(seq, + "kfence-#%zd [0x" PTR_FMT "-0x" PTR_FMT + ", size=%d, cache=%s] allocated by task %d:\n", + meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, + (cache && cache->name) ? cache->name : "", meta->alloc_track.pid); + kfence_print_stack(seq, meta, true); + + if (meta->state == KFENCE_OBJECT_FREED) { + seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid); + kfence_print_stack(seq, meta, false); + } +} + +/* + * Show bytes at @addr that are different from the expected canary values, up to + * @max_bytes. + */ +static void print_diff_canary(unsigned long address, size_t bytes_to_show, + const struct kfence_metadata *meta) +{ + const unsigned long show_until_addr = address + bytes_to_show; + const u8 *cur, *end; + + /* Do not show contents of object nor read into following guard page. */ + end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr) + : min(show_until_addr, PAGE_ALIGN(address))); + + pr_cont("["); + for (cur = (const u8 *)address; cur < end; cur++) { + if (*cur == KFENCE_CANARY_PATTERN(cur)) + pr_cont(" ."); + else if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + pr_cont(" 0x%02x", *cur); + else /* Do not leak kernel memory in non-debug builds. */ + pr_cont(" !"); + } + pr_cont(" ]"); +} + +void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, + enum kfence_error_type type) +{ + unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; + int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); + int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); + const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; + + /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ + if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) + return; + + if (meta) + lockdep_assert_held(&meta->lock); + /* + * Because we may generate reports in printk-unfriendly parts of the + * kernel, such as scheduler code, the use of printk() could deadlock. + * Until such time that all printing code here is safe in all parts of + * the kernel, accept the risk, and just get our message out (given the + * system might already behave unpredictably due to the memory error). + * As such, also disable lockdep to hide warnings, and avoid disabling + * lockdep for the rest of the kernel. + */ + lockdep_off(); + + pr_err("==================================================================\n"); + /* Print report header. */ + switch (type) { + case KFENCE_ERROR_OOB: { + const bool left_of_object = address < meta->addr; + + pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", + (void *)address, + left_of_object ? meta->addr - address : address - meta->addr, + left_of_object ? "left" : "right", object_index); + break; + } + case KFENCE_ERROR_UAF: + pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n", + (void *)address, object_index); + break; + case KFENCE_ERROR_CORRUPTION: + pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Corrupted memory at 0x" PTR_FMT " ", (void *)address); + print_diff_canary(address, 16, meta); + pr_cont(" (in kfence-#%zd):\n", object_index); + break; + case KFENCE_ERROR_INVALID: + pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address); + break; + case KFENCE_ERROR_INVALID_FREE: + pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Invalid free of 0x" PTR_FMT " (in kfence-#%zd):\n", (void *)address, + object_index); + break; + } + + /* Print stack trace and object info. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0); + + if (meta) { + pr_err("\n"); + kfence_print_object(NULL, meta); + } + + /* Print report footer. */ + pr_err("\n"); + dump_stack_print_info(KERN_ERR); + pr_err("==================================================================\n"); + + lockdep_on(); + + if (panic_on_warn) + panic("panic_on_warn set ...\n"); + + /* We encountered a memory unsafety error, taint the kernel! */ + add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); +} From 1dc0da6e9ec0f8d735756374697912cd50f402cf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:18:57 -0800 Subject: [PATCH 445/506] x86, kfence: enable KFENCE for x86 Add architecture specific implementation details for KFENCE and enable KFENCE for the x86 architecture. In particular, this implements the required interface in for setting up the pool and providing helper functions for protecting and unprotecting pages. For x86, we need to ensure that the pool uses 4K pages, which is done using the set_memory_4k() helper function. [elver@google.com: add missing copyright and description header] Link: https://lkml.kernel.org/r/20210118092159.145934-2-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-3-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Co-developed-by: Marco Elver Reviewed-by: Jann Horn Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/kfence.h | 70 +++++++++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 5 +++ 3 files changed, 76 insertions(+) create mode 100644 arch/x86/include/asm/kfence.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd4b9b1204a8..2792879d398e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -151,6 +151,7 @@ config X86 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if X86_64 + select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h new file mode 100644 index 000000000000..a0659dbd93ea --- /dev/null +++ b/arch/x86/include/asm/kfence.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * x86 KFENCE support. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef _ASM_X86_KFENCE_H +#define _ASM_X86_KFENCE_H + +#include +#include + +#include +#include +#include +#include + +/* + * The page fault handler entry function, up to which the stack trace is + * truncated in reports. + */ +#define KFENCE_SKIP_ARCH_FAULT_HANDLER "asm_exc_page_fault" + +/* Force 4K pages for __kfence_pool. */ +static inline bool arch_kfence_init_pool(void) +{ + unsigned long addr; + + for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); + addr += PAGE_SIZE) { + unsigned int level; + + if (!lookup_address(addr, &level)) + return false; + + if (level != PG_LEVEL_4K) + set_memory_4k(addr, 1); + } + + return true; +} + +/* Protect the given page and flush TLB. */ +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + unsigned int level; + pte_t *pte = lookup_address(addr, &level); + + if (WARN_ON(!pte || level != PG_LEVEL_4K)) + return false; + + /* + * We need to avoid IPIs, as we may get KFENCE allocations or faults + * with interrupts disabled. Therefore, the below is best-effort, and + * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; + * lazy fault handling takes care of faults after the page is PRESENT. + */ + + if (protect) + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + else + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + + /* Flush this CPU's TLB. */ + flush_tlb_one_kernel(addr); + return true; +} + +#endif /* _ASM_X86_KFENCE_H */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 525197381baa..99fe6d3e690d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -9,6 +9,7 @@ #include /* oops_begin/end, ... */ #include /* search_exception_tables */ #include /* max_low_pfn */ +#include /* kfence_handle_page_fault */ #include /* NOKPROBE_SYMBOL, ... */ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ @@ -680,6 +681,10 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address); + /* Only not-present faults should be handled by KFENCE. */ + if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address)) + return; + oops: /* * Oops. The kernel tried to access some bad page. We'll have to From 840b239863449f27bf7522deb81e6746fbfbfeaf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:03 -0800 Subject: [PATCH 446/506] arm64, kfence: enable KFENCE for ARM64 Add architecture specific implementation details for KFENCE and enable KFENCE for the arm64 architecture. In particular, this implements the required interface in . KFENCE requires that attributes for pages from its memory pool can individually be set. Therefore, force the entire linear map to be mapped at page granularity. Doing so may result in extra memory allocated for page tables in case rodata=full is not set; however, currently CONFIG_RODATA_FULL_DEFAULT_ENABLED=y is the default, and the common case is therefore not affected by this change. [elver@google.com: add missing copyright and description header] Link: https://lkml.kernel.org/r/20210118092159.145934-3-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-4-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Co-developed-by: Alexander Potapenko Reviewed-by: Jann Horn Reviewed-by: Mark Rutland Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/kfence.h | 24 ++++++++++++++++++++++++ arch/arm64/mm/fault.c | 4 ++++ arch/arm64/mm/mmu.c | 8 +++++++- 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/kfence.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fc0ce2a1f3bf..a254f4871683 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -140,6 +140,7 @@ config ARM64 select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE) + select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h new file mode 100644 index 000000000000..42a06f83850a --- /dev/null +++ b/arch/arm64/include/asm/kfence.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arm64 KFENCE support. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef __ASM_KFENCE_H +#define __ASM_KFENCE_H + +#include + +#define KFENCE_SKIP_ARCH_FAULT_HANDLER "el1_sync" + +static inline bool arch_kfence_init_pool(void) { return true; } + +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + set_memory_valid(addr, 1, !protect); + + return true; +} + +#endif /* __ASM_KFENCE_H */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index dc9f96442edc..42515900ab2e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -389,6 +390,9 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { + if (kfence_handle_page_fault(addr)) + return; + msg = "paging request"; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d0758d24a42d..ef7698c4e2f0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1465,7 +1465,13 @@ int arch_add_memory(int nid, u64 start, u64 size, int ret, flags = 0; VM_BUG_ON(!mhp_range_allowed(start, size, true)); - if (rodata_full || debug_pagealloc_enabled()) + + /* + * KFENCE requires linear map to be mapped at page granularity, so that + * it is possible to protect/unprotect single pages in the KFENCE pool. + */ + if (rodata_full || debug_pagealloc_enabled() || + IS_ENABLED(CONFIG_KFENCE)) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), From d438fabce7860df3cb9337776be6f90b59ced8ed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:08 -0800 Subject: [PATCH 447/506] kfence: use pt_regs to generate stack trace on faults Instead of removing the fault handling portion of the stack trace based on the fault handler's name, just use struct pt_regs directly. Change kfence_handle_page_fault() to take a struct pt_regs, and plumb it through to kfence_report_error() for out-of-bounds, use-after-free, or invalid access errors, where pt_regs is used to generate the stack trace. If the kernel is a DEBUG_KERNEL, also show registers for more information. Link: https://lkml.kernel.org/r/20201105092133.2075331-1-elver@google.com Signed-off-by: Marco Elver Suggested-by: Mark Rutland Acked-by: Mark Rutland Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/kfence.h | 2 -- arch/arm64/mm/fault.c | 2 +- arch/x86/include/asm/kfence.h | 6 ---- arch/x86/mm/fault.c | 2 +- include/linux/kfence.h | 5 +-- mm/kfence/core.c | 10 +++--- mm/kfence/kfence.h | 4 +-- mm/kfence/report.c | 63 +++++++++++++++++++-------------- 8 files changed, 48 insertions(+), 46 deletions(-) diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h index 42a06f83850a..d061176d57ea 100644 --- a/arch/arm64/include/asm/kfence.h +++ b/arch/arm64/include/asm/kfence.h @@ -10,8 +10,6 @@ #include -#define KFENCE_SKIP_ARCH_FAULT_HANDLER "el1_sync" - static inline bool arch_kfence_init_pool(void) { return true; } static inline bool kfence_protect_page(unsigned long addr, bool protect) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 42515900ab2e..56d9423ca59c 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (kfence_handle_page_fault(addr)) + if (kfence_handle_page_fault(addr, regs)) return; msg = "paging request"; diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h index a0659dbd93ea..97bbb4a9083a 100644 --- a/arch/x86/include/asm/kfence.h +++ b/arch/x86/include/asm/kfence.h @@ -16,12 +16,6 @@ #include #include -/* - * The page fault handler entry function, up to which the stack trace is - * truncated in reports. - */ -#define KFENCE_SKIP_ARCH_FAULT_HANDLER "asm_exc_page_fault" - /* Force 4K pages for __kfence_pool. */ static inline bool arch_kfence_init_pool(void) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 99fe6d3e690d..38868b4ce8b0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -682,7 +682,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ - if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address)) + if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs)) return; oops: diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 81f3911cb298..5a56bcf5606c 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr) /** * kfence_handle_page_fault() - perform page fault handling for KFENCE pages * @addr: faulting address + * @regs: current struct pt_regs (can be NULL, but shows full stack trace) * * Return: * * false - address outside KFENCE pool, @@ -196,7 +197,7 @@ static __always_inline __must_check bool kfence_free(void *addr) * cases KFENCE prints an error message and marks the offending page as * present, so that the kernel can proceed. */ -bool __must_check kfence_handle_page_fault(unsigned long addr); +bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs); #else /* CONFIG_KFENCE */ @@ -209,7 +210,7 @@ static inline size_t kfence_ksize(const void *addr) { return 0; } static inline void *kfence_object_start(const void *addr) { return NULL; } static inline void __kfence_free(void *addr) { } static inline bool __must_check kfence_free(void *addr) { return false; } -static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; } +static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; } #endif diff --git a/mm/kfence/core.c b/mm/kfence/core.c index d6a32c13336b..61c76670a7a9 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr), + kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr), KFENCE_ERROR_CORRUPTION); return false; } @@ -351,7 +351,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE); + kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); return; } @@ -766,7 +766,7 @@ void __kfence_free(void *addr) kfence_guarded_free(addr, meta, false); } -bool kfence_handle_page_fault(unsigned long addr) +bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; struct kfence_metadata *to_report = NULL; @@ -829,11 +829,11 @@ bool kfence_handle_page_fault(unsigned long addr) out: if (to_report) { - kfence_report_error(addr, to_report, error_type); + kfence_report_error(addr, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ - kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID); + kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID); } return kfence_unprotect(addr); /* Unprotect and let access proceed. */ diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 1014060f9707..0d83e628a97d 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -105,8 +105,8 @@ enum kfence_error_type { KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; -void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, - enum kfence_error_type type); +void kfence_report_error(unsigned long address, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 64f27c8d46a3..4dbfa9a382e4 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -41,7 +42,6 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries { char buf[64]; int skipnr, fallback = 0; - bool is_access_fault = false; if (type) { /* Depending on error type, find different stack entries. */ @@ -49,8 +49,12 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries case KFENCE_ERROR_UAF: case KFENCE_ERROR_OOB: case KFENCE_ERROR_INVALID: - is_access_fault = true; - break; + /* + * kfence_handle_page_fault() may be called with pt_regs + * set to NULL; in that case we'll simply show the full + * stack trace. + */ + return 0; case KFENCE_ERROR_CORRUPTION: case KFENCE_ERROR_INVALID_FREE: break; @@ -60,26 +64,21 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries for (skipnr = 0; skipnr < num_entries; skipnr++) { int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); - if (is_access_fault) { - if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len)) - goto found; - } else { - if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || - !strncmp(buf, "__slab_free", len)) { - /* - * In case of tail calls from any of the below - * to any of the above. - */ - fallback = skipnr + 1; - } - - /* Also the *_bulk() variants by only checking prefixes. */ - if (str_has_prefix(buf, "kfree") || - str_has_prefix(buf, "kmem_cache_free") || - str_has_prefix(buf, "__kmalloc") || - str_has_prefix(buf, "kmem_cache_alloc")) - goto found; + if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || + !strncmp(buf, "__slab_free", len)) { + /* + * In case of tail calls from any of the below + * to any of the above. + */ + fallback = skipnr + 1; } + + /* Also the *_bulk() variants by only checking prefixes. */ + if (str_has_prefix(buf, "kfree") || + str_has_prefix(buf, "kmem_cache_free") || + str_has_prefix(buf, "__kmalloc") || + str_has_prefix(buf, "kmem_cache_alloc")) + goto found; } if (fallback < num_entries) return fallback; @@ -157,13 +156,20 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, pr_cont(" ]"); } -void kfence_report_error(unsigned long address, const struct kfence_metadata *meta, - enum kfence_error_type type) +void kfence_report_error(unsigned long address, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type) { unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; - int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); - int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; + int num_stack_entries; + int skipnr = 0; + + if (regs) { + num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0); + } else { + num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); + } /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) @@ -227,7 +233,10 @@ void kfence_report_error(unsigned long address, const struct kfence_metadata *me /* Print report footer. */ pr_err("\n"); - dump_stack_print_info(KERN_ERR); + if (IS_ENABLED(CONFIG_DEBUG_KERNEL) && regs) + show_regs(regs); + else + dump_stack_print_info(KERN_ERR); pr_err("==================================================================\n"); lockdep_on(); From d3fb45f370d927224af35d22d34ea465884afec8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:11 -0800 Subject: [PATCH 448/506] mm, kfence: insert KFENCE hooks for SLAB Inserts KFENCE hooks into the SLAB allocator. To pass the originally requested size to KFENCE, add an argument 'orig_size' to slab_alloc*(). The additional argument is required to preserve the requested original size for kmalloc() allocations, which uses size classes (e.g. an allocation of 272 bytes will return an object of size 512). Therefore, kmem_cache::size does not represent the kmalloc-caller's requested size, and we must introduce the argument 'orig_size' to propagate the originally requested size to KFENCE. Without the originally requested size, we would not be able to detect out-of-bounds accesses for objects placed at the end of a KFENCE object page if that object is not equal to the kmalloc-size class it was bucketed into. When KFENCE is disabled, there is no additional overhead, since slab_alloc*() functions are __always_inline. Link: https://lkml.kernel.org/r/20201103175841.3495947-5-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Co-developed-by: Marco Elver Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Joern Engel Cc: Jonathan Corbet Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 3 +++ mm/kfence/core.c | 2 ++ mm/slab.c | 38 +++++++++++++++++++++++++++++--------- mm/slab_common.c | 5 ++++- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 9eb430c163c2..3aa5e1e73ab6 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -2,6 +2,7 @@ #ifndef _LINUX_SLAB_DEF_H #define _LINUX_SLAB_DEF_H +#include #include /* @@ -114,6 +115,8 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, static inline int objs_per_slab_page(const struct kmem_cache *cache, const struct page *page) { + if (is_kfence_address(page_address(page))) + return 1; return cache->num; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 61c76670a7a9..05c18aa11851 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g /* Set required struct page fields. */ page = virt_to_page(meta->addr); page->slab_cache = cache; + if (IS_ENABLED(CONFIG_SLAB)) + page->s_mem = addr; raw_spin_unlock_irqrestore(&meta->lock, flags); diff --git a/mm/slab.c b/mm/slab.c index 35c68d99d460..51fd424e0d6d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -3208,7 +3209,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, } static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; @@ -3221,6 +3222,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (unlikely(!cachep)) return NULL; + ptr = kfence_alloc(cachep, orig_size, flags); + if (unlikely(ptr)) + goto out_hooks; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3253,6 +3258,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) memset(ptr, 0, cachep->object_size); +out_hooks: slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr); return ptr; } @@ -3290,7 +3296,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) +slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3301,6 +3307,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (unlikely(!cachep)) return NULL; + objp = kfence_alloc(cachep, orig_size, flags); + if (unlikely(objp)) + goto out; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3311,6 +3321,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) memset(objp, 0, cachep->object_size); +out: slab_post_alloc_hook(cachep, objcg, flags, 1, &objp); return objp; } @@ -3416,6 +3427,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + if (is_kfence_address(objp)) { + kmemleak_free_recursive(objp, cachep->flags); + __kfence_free(objp); + return; + } + if (unlikely(slab_want_init_on_free(cachep))) memset(objp, 0, cachep->object_size); @@ -3482,7 +3499,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = slab_alloc(cachep, flags, _RET_IP_); + void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3515,7 +3532,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); if (unlikely(!objp)) goto error; @@ -3548,7 +3565,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; - ret = slab_alloc(cachep, flags, _RET_IP_); + ret = slab_alloc(cachep, flags, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, @@ -3574,7 +3591,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, @@ -3592,7 +3609,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, { void *ret; - ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, @@ -3673,7 +3690,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = slab_alloc(cachep, flags, caller); + ret = slab_alloc(cachep, flags, size, caller); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, @@ -4172,7 +4189,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, BUG_ON(objnr >= cachep->num); /* Find offset within object. */ - offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + if (is_kfence_address(ptr)) + offset = ptr - kfence_object_start(ptr); + else + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); /* Allow address range falling entirely within usercopy region. */ if (offset >= cachep->useroffset && diff --git a/mm/slab_common.c b/mm/slab_common.c index 7c8298c17145..284954ef1da5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -430,6 +431,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) rcu_barrier(); list_for_each_entry_safe(s, s2, &to_destroy, list) { + kfence_shutdown_cache(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_release(s); #else @@ -455,6 +457,7 @@ static int shutdown_cache(struct kmem_cache *s) list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { + kfence_shutdown_cache(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_unlink(s); sysfs_slab_release(s); @@ -1235,7 +1238,7 @@ size_t ksize(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; - size = __ksize(objp); + size = kfence_ksize(objp) ?: __ksize(objp); /* * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. From b89fb5ef0ce611b5db8eb9d3a5a7fcaab2cbe9e4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:16 -0800 Subject: [PATCH 449/506] mm, kfence: insert KFENCE hooks for SLUB Inserts KFENCE hooks into the SLUB allocator. To pass the originally requested size to KFENCE, add an argument 'orig_size' to slab_alloc*(). The additional argument is required to preserve the requested original size for kmalloc() allocations, which uses size classes (e.g. an allocation of 272 bytes will return an object of size 512). Therefore, kmem_cache::size does not represent the kmalloc-caller's requested size, and we must introduce the argument 'orig_size' to propagate the originally requested size to KFENCE. Without the originally requested size, we would not be able to detect out-of-bounds accesses for objects placed at the end of a KFENCE object page if that object is not equal to the kmalloc-size class it was bucketed into. When KFENCE is disabled, there is no additional overhead, since slab_alloc*() functions are __always_inline. Link: https://lkml.kernel.org/r/20201103175841.3495947-6-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Reviewed-by: Jann Horn Co-developed-by: Marco Elver Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 3 ++ mm/kfence/core.c | 2 ++ mm/slub.c | 60 ++++++++++++++++++++++++++++++---------- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 1be0ed5befa1..dcde82a4434c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -7,6 +7,7 @@ * * (C) 2007 SGI, Christoph Lameter */ +#include #include #include @@ -185,6 +186,8 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache, static inline unsigned int obj_to_index(const struct kmem_cache *cache, const struct page *page, void *obj) { + if (is_kfence_address(obj)) + return 0; return __obj_to_index(cache, page_address(page), obj); } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 05c18aa11851..7692af715fdb 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g /* Set required struct page fields. */ page = virt_to_page(meta->addr); page->slab_cache = cache; + if (IS_ENABLED(CONFIG_SLUB)) + page->objects = 1; if (IS_ENABLED(CONFIG_SLAB)) page->s_mem = addr; diff --git a/mm/slub.c b/mm/slub.c index b2833ce85c92..383616af28c4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1570,6 +1571,11 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *old_tail = *tail ? *tail : *head; int rsize; + if (is_kfence_address(next)) { + slab_free_hook(s, next); + return true; + } + /* Head and tail of the reconstructed freelist */ *head = NULL; *tail = NULL; @@ -2809,7 +2815,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr) + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; struct kmem_cache_cpu *c; @@ -2820,6 +2826,11 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is @@ -2892,20 +2903,21 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(kasan_reset_tag(object), 0, s->object_size); +out: slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); return object; } static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, unsigned long addr) + gfp_t gfpflags, unsigned long addr, size_t orig_size) { - return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size); } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); @@ -2917,7 +2929,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -2928,7 +2940,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size); trace_kmem_cache_alloc_node(_RET_IP_, ret, s->object_size, s->size, gfpflags, node); @@ -2942,7 +2954,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -2976,6 +2988,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); + if (kfence_free(head)) + return; + if (kmem_cache_debug(s) && !free_debug_processing(s, page, head, tail, cnt, addr)) return; @@ -3220,6 +3235,13 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, df->s = cache_from_obj(s, object); /* Support for memcg */ } + if (is_kfence_address(object)) { + slab_free_hook(df->s, object); + __kfence_free(object); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Start new detached freelist */ df->page = page; set_freepointer(df->s, object, NULL); @@ -3295,8 +3317,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); for (i = 0; i < size; i++) { - void *object = c->freelist; + void *object = kfence_alloc(s, s->object_size, flags); + if (unlikely(object)) { + p[i] = object; + continue; + } + + object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -4021,7 +4049,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -4069,7 +4097,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -4095,6 +4123,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, struct kmem_cache *s; unsigned int offset; size_t object_size; + bool is_kfence = is_kfence_address(ptr); ptr = kasan_reset_tag(ptr); @@ -4107,10 +4136,13 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, to_user, 0, n); /* Find offset within object. */ - offset = (ptr - page_address(page)) % s->size; + if (is_kfence) + offset = ptr - kfence_object_start(ptr); + else + offset = (ptr - page_address(page)) % s->size; /* Adjust for redzone and reject if within the redzone. */ - if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { + if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { if (offset < s->red_left_pad) usercopy_abort("SLUB object in left red zone", s->name, to_user, offset, n); @@ -4527,7 +4559,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, caller); + ret = slab_alloc(s, gfpflags, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4558,7 +4590,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); From 2b8305260fb37fc20e13f71e13073304d0a031c8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:21 -0800 Subject: [PATCH 450/506] kfence, kasan: make KFENCE compatible with KASAN Make KFENCE compatible with KASAN. Currently this helps test KFENCE itself, where KASAN can catch potential corruptions to KFENCE state, or other corruptions that may be a result of freepointer corruptions in the main allocators. [akpm@linux-foundation.org: merge fixup] [andreyknvl@google.com: untag addresses for KFENCE] Link: https://lkml.kernel.org/r/9dc196006921b191d25d10f6e611316db7da2efc.1611946152.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-7-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Alexander Potapenko Signed-off-by: Andrey Konovalov Reviewed-by: Dmitry Vyukov Reviewed-by: Jann Horn Co-developed-by: Marco Elver Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kfence | 2 +- mm/kasan/common.c | 6 ++++++ mm/kasan/generic.c | 3 ++- mm/kasan/kasan.h | 21 ++++++++++++++++++--- mm/kasan/shadow.c | 13 +++++++++++++ 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index b88ac9d6b2e6..edfecb5d6165 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -5,7 +5,7 @@ config HAVE_ARCH_KFENCE menuconfig KFENCE bool "KFENCE: low-overhead sampling-based memory safety error detector" - depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB) + depends on HAVE_ARCH_KFENCE && (SLAB || SLUB) select STACKTRACE help KFENCE is a low-overhead sampling-based detector of heap out-of-bounds diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b18189ef3a92..af1768c4fee5 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -335,6 +335,9 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, tagged_object = object; object = kasan_reset_tag(object); + if (is_kfence_address(object)) + return false; + if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { kasan_report_invalid_free(tagged_object, ip); @@ -413,6 +416,9 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, if (unlikely(object == NULL)) return NULL; + if (is_kfence_address(kasan_reset_tag(object))) + return (void *)object; + redzone_start = round_up((unsigned long)(object + size), KASAN_GRANULE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 3f17a1218055..2e55e0f82f39 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -331,7 +332,7 @@ void kasan_record_aux_stack(void *addr) struct kasan_alloc_meta *alloc_meta; void *object; - if (!(page && PageSlab(page))) + if (is_kfence_address(addr) || !(page && PageSlab(page))) return; cache = page->slab_cache; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cc14b6e6c14c..fb883740fd27 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -3,6 +3,7 @@ #define __MM_KASAN_KASAN_H #include +#include #include #ifdef CONFIG_KASAN_HW_TAGS @@ -331,14 +332,28 @@ static inline u8 kasan_random_tag(void) { return 0; } static inline void kasan_poison(const void *address, size_t size, u8 value) { - hw_set_mem_tag_range(kasan_reset_tag(address), + address = kasan_reset_tag(address); + + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(address)) + return; + + hw_set_mem_tag_range((void *)address, round_up(size, KASAN_GRANULE_SIZE), value); } static inline void kasan_unpoison(const void *address, size_t size) { - hw_set_mem_tag_range(kasan_reset_tag(address), - round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); + u8 tag = get_tag(address); + + address = kasan_reset_tag(address); + + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(address)) + return; + + hw_set_mem_tag_range((void *)address, + round_up(size, KASAN_GRANULE_SIZE), tag); } static inline bool kasan_byte_accessible(const void *addr) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 80adc85d0393..1372a2fc0ca9 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,10 @@ void kasan_poison(const void *address, size_t size, u8 value) address = kasan_reset_tag(address); size = round_up(size, KASAN_GRANULE_SIZE); + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(address)) + return; + shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); @@ -102,6 +107,14 @@ void kasan_unpoison(const void *address, size_t size) */ address = kasan_reset_tag(address); + /* + * Skip KFENCE memory if called explicitly outside of sl*b. Also note + * that calls to ksize(), where size is not a multiple of machine-word + * size, would otherwise poison the invalid portion of the word. + */ + if (is_kfence_address(address)) + return; + kasan_poison(address, size, tag); if (size & KASAN_GRANULE_MASK) { From 10efe55f883f2396a0024891ad1d7d5d040364b3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:26 -0800 Subject: [PATCH 451/506] kfence, Documentation: add KFENCE documentation Add KFENCE documentation in dev-tools/kfence.rst, and add to index. [elver@google.com: add missing copyright header to documentation] Link: https://lkml.kernel.org/r/20210118092159.145934-4-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-8-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Co-developed-by: Alexander Potapenko Reviewed-by: Jann Horn Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/index.rst | 1 + Documentation/dev-tools/kfence.rst | 298 +++++++++++++++++++++++++++++ lib/Kconfig.kfence | 2 + 3 files changed, 301 insertions(+) create mode 100644 Documentation/dev-tools/kfence.rst diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index f7809c7b1ba9..1b1cf4f5c9d9 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -22,6 +22,7 @@ whole; patches welcome! ubsan kmemleak kcsan + kfence gdb-kernel-debugging kgdb kselftest diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst new file mode 100644 index 000000000000..0e2fb6ef3016 --- /dev/null +++ b/Documentation/dev-tools/kfence.rst @@ -0,0 +1,298 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2020, Google LLC. + +Kernel Electric-Fence (KFENCE) +============================== + +Kernel Electric-Fence (KFENCE) is a low-overhead sampling-based memory safety +error detector. KFENCE detects heap out-of-bounds access, use-after-free, and +invalid-free errors. + +KFENCE is designed to be enabled in production kernels, and has near zero +performance overhead. Compared to KASAN, KFENCE trades performance for +precision. The main motivation behind KFENCE's design, is that with enough +total uptime KFENCE will detect bugs in code paths not typically exercised by +non-production test workloads. One way to quickly achieve a large enough total +uptime is when the tool is deployed across a large fleet of machines. + +Usage +----- + +To enable KFENCE, configure the kernel with:: + + CONFIG_KFENCE=y + +To build a kernel with KFENCE support, but disabled by default (to enable, set +``kfence.sample_interval`` to non-zero value), configure the kernel with:: + + CONFIG_KFENCE=y + CONFIG_KFENCE_SAMPLE_INTERVAL=0 + +KFENCE provides several other configuration options to customize behaviour (see +the respective help text in ``lib/Kconfig.kfence`` for more info). + +Tuning performance +~~~~~~~~~~~~~~~~~~ + +The most important parameter is KFENCE's sample interval, which can be set via +the kernel boot parameter ``kfence.sample_interval`` in milliseconds. The +sample interval determines the frequency with which heap allocations will be +guarded by KFENCE. The default is configurable via the Kconfig option +``CONFIG_KFENCE_SAMPLE_INTERVAL``. Setting ``kfence.sample_interval=0`` +disables KFENCE. + +The KFENCE memory pool is of fixed size, and if the pool is exhausted, no +further KFENCE allocations occur. With ``CONFIG_KFENCE_NUM_OBJECTS`` (default +255), the number of available guarded objects can be controlled. Each object +requires 2 pages, one for the object itself and the other one used as a guard +page; object pages are interleaved with guard pages, and every object page is +therefore surrounded by two guard pages. + +The total memory dedicated to the KFENCE memory pool can be computed as:: + + ( #objects + 1 ) * 2 * PAGE_SIZE + +Using the default config, and assuming a page size of 4 KiB, results in +dedicating 2 MiB to the KFENCE memory pool. + +Note: On architectures that support huge pages, KFENCE will ensure that the +pool is using pages of size ``PAGE_SIZE``. This will result in additional page +tables being allocated. + +Error reports +~~~~~~~~~~~~~ + +A typical out-of-bounds access looks like this:: + + ================================================================== + BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b + + Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17): + test_out_of_bounds_read+0xa3/0x22b + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#17 [0xffffffffb672f000-0xffffffffb672f01f, size=32, cache=kmalloc-32] allocated by task 507: + test_alloc+0xf3/0x25b + test_out_of_bounds_read+0x98/0x22b + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 107 Comm: kunit_try_catch Not tainted 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +The header of the report provides a short summary of the function involved in +the access. It is followed by more detailed information about the access and +its origin. Note that, real kernel addresses are only shown for +``CONFIG_DEBUG_KERNEL=y`` builds. + +Use-after-free accesses are reported as:: + + ================================================================== + BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143 + + Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24): + test_use_after_free_read+0xb3/0x143 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#24 [0xffffffffb673dfe0-0xffffffffb673dfff, size=32, cache=kmalloc-32] allocated by task 507: + test_alloc+0xf3/0x25b + test_use_after_free_read+0x76/0x143 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + freed by task 507: + test_use_after_free_read+0xa8/0x143 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 109 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +KFENCE also reports on invalid frees, such as double-frees:: + + ================================================================== + BUG: KFENCE: invalid free in test_double_free+0xdc/0x171 + + Invalid free of 0xffffffffb6741000: + test_double_free+0xdc/0x171 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#26 [0xffffffffb6741000-0xffffffffb674101f, size=32, cache=kmalloc-32] allocated by task 507: + test_alloc+0xf3/0x25b + test_double_free+0x76/0x171 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + freed by task 507: + test_double_free+0xa8/0x171 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 111 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +KFENCE also uses pattern-based redzones on the other side of an object's guard +page, to detect out-of-bounds writes on the unprotected side of the object. +These are reported on frees:: + + ================================================================== + BUG: KFENCE: memory corruption in test_kmalloc_aligned_oob_write+0xef/0x184 + + Corrupted memory at 0xffffffffb6797ff9 [ 0xac . . . . . . ] (in kfence-#69): + test_kmalloc_aligned_oob_write+0xef/0x184 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + kfence-#69 [0xffffffffb6797fb0-0xffffffffb6797ff8, size=73, cache=kmalloc-96] allocated by task 507: + test_alloc+0xf3/0x25b + test_kmalloc_aligned_oob_write+0x57/0x184 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 120 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +For such errors, the address where the corruption occurred as well as the +invalidly written bytes (offset from the address) are shown; in this +representation, '.' denote untouched bytes. In the example above ``0xac`` is +the value written to the invalid address at offset 0, and the remaining '.' +denote that no following bytes have been touched. Note that, real values are +only shown for ``CONFIG_DEBUG_KERNEL=y`` builds; to avoid information +disclosure for non-debug builds, '!' is used instead to denote invalidly +written bytes. + +And finally, KFENCE may also report on invalid accesses to any protected page +where it was not possible to determine an associated object, e.g. if adjacent +object pages had not yet been allocated:: + + ================================================================== + BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0 + + Invalid access at 0xffffffffb670b00a: + test_invalid_access+0x26/0xe0 + kunit_try_run_case+0x51/0x85 + kunit_generic_run_threadfn_adapter+0x16/0x30 + kthread+0x137/0x160 + ret_from_fork+0x22/0x30 + + CPU: 4 PID: 124 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + ================================================================== + +DebugFS interface +~~~~~~~~~~~~~~~~~ + +Some debugging information is exposed via debugfs: + +* The file ``/sys/kernel/debug/kfence/stats`` provides runtime statistics. + +* The file ``/sys/kernel/debug/kfence/objects`` provides a list of objects + allocated via KFENCE, including those already freed but protected. + +Implementation Details +---------------------- + +Guarded allocations are set up based on the sample interval. After expiration +of the sample interval, the next allocation through the main allocator (SLAB or +SLUB) returns a guarded allocation from the KFENCE object pool (allocation +sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and +the next allocation is set up after the expiration of the interval. To "gate" a +KFENCE allocation through the main allocator's fast-path without overhead, +KFENCE relies on static branches via the static keys infrastructure. The static +branch is toggled to redirect the allocation to KFENCE. + +KFENCE objects each reside on a dedicated page, at either the left or right +page boundaries selected at random. The pages to the left and right of the +object page are "guard pages", whose attributes are changed to a protected +state, and cause page faults on any attempted access. Such page faults are then +intercepted by KFENCE, which handles the fault gracefully by reporting an +out-of-bounds access, and marking the page as accessible so that the faulting +code can (wrongly) continue executing (set ``panic_on_warn`` to panic instead). + +To detect out-of-bounds writes to memory within the object's page itself, +KFENCE also uses pattern-based redzones. For each object page, a redzone is set +up for all non-object memory. For typical alignments, the redzone is only +required on the unguarded side of an object. Because KFENCE must honor the +cache's requested alignment, special alignments may result in unprotected gaps +on either side of an object, all of which are redzoned. + +The following figure illustrates the page layout:: + + ---+-----------+-----------+-----------+-----------+-----------+--- + | xxxxxxxxx | O : | xxxxxxxxx | : O | xxxxxxxxx | + | xxxxxxxxx | B : | xxxxxxxxx | : B | xxxxxxxxx | + | x GUARD x | J : RED- | x GUARD x | RED- : J | x GUARD x | + | xxxxxxxxx | E : ZONE | xxxxxxxxx | ZONE : E | xxxxxxxxx | + | xxxxxxxxx | C : | xxxxxxxxx | : C | xxxxxxxxx | + | xxxxxxxxx | T : | xxxxxxxxx | : T | xxxxxxxxx | + ---+-----------+-----------+-----------+-----------+-----------+--- + +Upon deallocation of a KFENCE object, the object's page is again protected and +the object is marked as freed. Any further access to the object causes a fault +and KFENCE reports a use-after-free access. Freed objects are inserted at the +tail of KFENCE's freelist, so that the least recently freed objects are reused +first, and the chances of detecting use-after-frees of recently freed objects +is increased. + +Interface +--------- + +The following describes the functions which are used by allocators as well as +page handling code to set up and deal with KFENCE allocations. + +.. kernel-doc:: include/linux/kfence.h + :functions: is_kfence_address + kfence_shutdown_cache + kfence_alloc kfence_free __kfence_free + kfence_ksize kfence_object_start + kfence_handle_page_fault + +Related Tools +------------- + +In userspace, a similar approach is taken by `GWP-ASan +`_. GWP-ASan also relies on guard pages and +a sampling strategy to detect memory unsafety bugs at scale. KFENCE's design is +directly influenced by GWP-ASan, and can be seen as its kernel sibling. Another +similar but non-sampling approach, that also inspired the name "KFENCE", can be +found in the userspace `Electric Fence Malloc Debugger +`_. + +In the kernel, several tools exist to debug memory access errors, and in +particular KASAN can detect all bug classes that KFENCE can detect. While KASAN +is more precise, relying on compiler instrumentation, this comes at a +performance cost. + +It is worth highlighting that KASAN and KFENCE are complementary, with +different target environments. For instance, KASAN is the better debugging-aid, +where test cases or reproducers exists: due to the lower chance to detect the +error, it would require more effort using KFENCE to debug. Deployments at scale +that cannot afford to enable KASAN, however, would benefit from using KFENCE to +discover bugs due to code paths not exercised by test cases or fuzzers. diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index edfecb5d6165..605125ac2ae0 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -13,6 +13,8 @@ menuconfig KFENCE to have negligible cost to permit enabling it in production environments. + See for more details. + Note that, KFENCE is not a substitute for explicit testing with tools such as KASAN. KFENCE can detect a subset of bugs that KASAN can detect, albeit at very different performance profiles. If you can From bc8fbc5f305aecf63423da91e5faf4c0ce40bf38 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:31 -0800 Subject: [PATCH 452/506] kfence: add test suite Add KFENCE test suite, testing various error detection scenarios. Makes use of KUnit for test organization. Since KFENCE's interface to obtain error reports is via the console, the test verifies that KFENCE outputs expected reports to the console. [elver@google.com: fix typo in test] Link: https://lkml.kernel.org/r/X9lHQExmHGvETxY4@elver.google.com [elver@google.com: show access type in report] Link: https://lkml.kernel.org/r/20210111091544.3287013-2-elver@google.com Link: https://lkml.kernel.org/r/20201103175841.3495947-9-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Co-developed-by: Alexander Potapenko Reviewed-by: Jann Horn Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: SeongJae Park Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kfence.rst | 12 +- arch/arm64/mm/fault.c | 2 +- arch/x86/mm/fault.c | 3 +- include/linux/kfence.h | 9 +- lib/Kconfig.kfence | 13 + mm/kfence/Makefile | 3 + mm/kfence/core.c | 11 +- mm/kfence/kfence.h | 2 +- mm/kfence/kfence_test.c | 858 +++++++++++++++++++++++++++++ mm/kfence/report.c | 27 +- 10 files changed, 915 insertions(+), 25 deletions(-) create mode 100644 mm/kfence/kfence_test.c diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 0e2fb6ef3016..58a0a5fa1ddc 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -65,9 +65,9 @@ Error reports A typical out-of-bounds access looks like this:: ================================================================== - BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b + BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa3/0x22b - Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17): + Out-of-bounds read at 0xffffffffb672efff (1B left of kfence-#17): test_out_of_bounds_read+0xa3/0x22b kunit_try_run_case+0x51/0x85 kunit_generic_run_threadfn_adapter+0x16/0x30 @@ -94,9 +94,9 @@ its origin. Note that, real kernel addresses are only shown for Use-after-free accesses are reported as:: ================================================================== - BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143 + BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143 - Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24): + Use-after-free read at 0xffffffffb673dfe0 (in kfence-#24): test_use_after_free_read+0xb3/0x143 kunit_try_run_case+0x51/0x85 kunit_generic_run_threadfn_adapter+0x16/0x30 @@ -193,9 +193,9 @@ where it was not possible to determine an associated object, e.g. if adjacent object pages had not yet been allocated:: ================================================================== - BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0 + BUG: KFENCE: invalid read in test_invalid_access+0x26/0xe0 - Invalid access at 0xffffffffb670b00a: + Invalid read at 0xffffffffb670b00a: test_invalid_access+0x26/0xe0 kunit_try_run_case+0x51/0x85 kunit_generic_run_threadfn_adapter+0x16/0x30 diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 56d9423ca59c..f37d4e3830b7 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (kfence_handle_page_fault(addr, regs)) + if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; msg = "paging request"; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 38868b4ce8b0..a73347e2cdfc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -682,7 +682,8 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ - if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs)) + if (!(error_code & X86_PF_PROT) && + kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; oops: diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 5a56bcf5606c..a70d1ea03532 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr) /** * kfence_handle_page_fault() - perform page fault handling for KFENCE pages * @addr: faulting address + * @is_write: is access a write * @regs: current struct pt_regs (can be NULL, but shows full stack trace) * * Return: @@ -197,7 +198,7 @@ static __always_inline __must_check bool kfence_free(void *addr) * cases KFENCE prints an error message and marks the offending page as * present, so that the kernel can proceed. */ -bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs); +bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs); #else /* CONFIG_KFENCE */ @@ -210,7 +211,11 @@ static inline size_t kfence_ksize(const void *addr) { return 0; } static inline void *kfence_object_start(const void *addr) { return NULL; } static inline void __kfence_free(void *addr) { } static inline bool __must_check kfence_free(void *addr) { return false; } -static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; } +static inline bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, + struct pt_regs *regs) +{ + return false; +} #endif diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 605125ac2ae0..78f50ccb3b45 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -66,4 +66,17 @@ config KFENCE_STRESS_TEST_FAULTS Only for KFENCE testing; set to 0 if you are not a KFENCE developer. +config KFENCE_KUNIT_TEST + tristate "KFENCE integration test suite" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS + depends on TRACEPOINTS && KUNIT + help + Test suite for KFENCE, testing various error detection scenarios with + various allocation types, and checking that reports are correctly + output to console. + + Say Y here if you want the test to be built into the kernel and run + during boot; say M if you want the test to build as a module; say N + if you are unsure. + endif # KFENCE diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile index d991e9a349f0..6872cd5e5390 100644 --- a/mm/kfence/Makefile +++ b/mm/kfence/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KFENCE) := core.o report.o + +CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls +obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 7692af715fdb..cfe3d32ac5b7 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr), + kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr), KFENCE_ERROR_CORRUPTION); return false; } @@ -355,7 +355,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE); + kfence_report_error((unsigned long)addr, false, NULL, meta, + KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); return; } @@ -770,7 +771,7 @@ void __kfence_free(void *addr) kfence_guarded_free(addr, meta, false); } -bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) +bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) { const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; struct kfence_metadata *to_report = NULL; @@ -833,11 +834,11 @@ bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) out: if (to_report) { - kfence_report_error(addr, regs, to_report, error_type); + kfence_report_error(addr, is_write, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ - kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID); + kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); } return kfence_unprotect(addr); /* Unprotect and let access proceed. */ diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 0d83e628a97d..1accc840dbbe 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -105,7 +105,7 @@ enum kfence_error_type { KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; -void kfence_report_error(unsigned long address, struct pt_regs *regs, +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, const struct kfence_metadata *meta, enum kfence_error_type type); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c new file mode 100644 index 000000000000..db1bb596acaf --- /dev/null +++ b/mm/kfence/kfence_test.c @@ -0,0 +1,858 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KFENCE memory safety error detector. Since the interface with + * which KFENCE's reports are obtained is via the console, this is the output we + * should verify. For each test case checks the presence (or absence) of + * generated reports. Relies on 'console' tracepoint to capture reports as they + * appear in the kernel log. + * + * Copyright (C) 2020, Google LLC. + * Author: Alexander Potapenko + * Marco Elver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kfence.h" + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[2][256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) { + /* + * KFENCE report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + } + + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Information we expect in a report. */ +struct expect_report { + enum kfence_error_type type; /* The type or error. */ + void *fn; /* Function pointer to expected function where access occurred. */ + char *addr; /* Address at which the bad access occurred. */ + bool is_write; /* Is access a write. */ +}; + +static const char *get_access_type(const struct expect_report *r) +{ + return r->is_write ? "write" : "read"; +} + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s", + get_access_type(r)); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s", + get_access_type(r)); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s", + get_access_type(r)); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free"); + break; + } + + scnprintf(cur, end - cur, " in %pS", r->fn); + /* The exact offset won't match, remove it; also strip module name. */ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + + /* Access information */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r)); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r)); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "Corrupted memory at"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r)); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "Invalid free of"); + break; + } + + cur += scnprintf(cur, end - cur, " 0x" PTR_FMT, (void *)r->addr); + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test cases ===== */ + +#define TEST_PRIV_WANT_MEMCACHE ((void *)1) + +/* Cache used by tests; if NULL, allocate from kmalloc instead. */ +static struct kmem_cache *test_cache; + +static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags, + void (*ctor)(void *)) +{ + if (test->priv != TEST_PRIV_WANT_MEMCACHE) + return size; + + kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor); + + /* + * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any + * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to + * allocate via memcg, if enabled. + */ + flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT; + test_cache = kmem_cache_create("test", size, 1, flags, ctor); + KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache"); + + return size; +} + +static void test_cache_destroy(void) +{ + if (!test_cache) + return; + + kmem_cache_destroy(test_cache); + test_cache = NULL; +} + +static inline size_t kmalloc_cache_alignment(size_t size) +{ + return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align; +} + +/* Must always inline to match stack trace against caller. */ +static __always_inline void test_free(void *ptr) +{ + if (test_cache) + kmem_cache_free(test_cache, ptr); + else + kfree(ptr); +} + +/* + * If this should be a KFENCE allocation, and on which side the allocation and + * the closest guard page should be. + */ +enum allocation_policy { + ALLOCATE_ANY, /* KFENCE, any side. */ + ALLOCATE_LEFT, /* KFENCE, left side of page. */ + ALLOCATE_RIGHT, /* KFENCE, right side of page. */ + ALLOCATE_NONE, /* No KFENCE allocation. */ +}; + +/* + * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the + * current test_cache if set up. + */ +static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy) +{ + void *alloc; + unsigned long timeout, resched_after; + const char *policy_name; + + switch (policy) { + case ALLOCATE_ANY: + policy_name = "any"; + break; + case ALLOCATE_LEFT: + policy_name = "left"; + break; + case ALLOCATE_RIGHT: + policy_name = "right"; + break; + case ALLOCATE_NONE: + policy_name = "none"; + break; + } + + kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp, + policy_name, !!test_cache); + + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. + */ + timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + /* + * Especially for non-preemption kernels, ensure the allocation-gate + * timer can catch up: after @resched_after, every failed allocation + * attempt yields, to ensure the allocation-gate timer is scheduled. + */ + resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL); + do { + if (test_cache) + alloc = kmem_cache_alloc(test_cache, gfp); + else + alloc = kmalloc(size, gfp); + + if (is_kfence_address(alloc)) { + struct page *page = virt_to_head_page(alloc); + struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]; + + /* + * Verify that various helpers return the right values + * even for KFENCE objects; these are required so that + * memcg accounting works correctly. + */ + KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U); + KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1); + + if (policy == ALLOCATE_ANY) + return alloc; + if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + return alloc; + if (policy == ALLOCATE_RIGHT && + !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + return alloc; + } else if (policy == ALLOCATE_NONE) + return alloc; + + test_free(alloc); + + if (time_after(jiffies, resched_after)) + cond_resched(); + } while (time_before(jiffies, timeout)); + + KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE"); + return NULL; /* Unreachable. */ +} + +static void test_out_of_bounds_read(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_read, + .is_write = false, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* + * If we don't have our own cache, adjust based on alignment, so that we + * actually access guard pages on either side. + */ + if (!test_cache) + size = kmalloc_cache_alignment(size); + + /* Test both sides. */ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf + size; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_out_of_bounds_write(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_write, + .is_write = true, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_use_after_free_read(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_use_after_free_read, + .is_write = false, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_double_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_double_free, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + test_free(expect.addr); /* Double-free. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_invalid_addr_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_invalid_addr_free, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + expect.addr = buf + 1; /* Free on invalid address. */ + test_free(expect.addr); /* Invalid address free. */ + test_free(buf); /* No error. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_corruption(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_corruption, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* Test both sides. */ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * KFENCE is unable to detect an OOB if the allocation's alignment requirements + * leave a gap between the object and the guard page. Specifically, an + * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB + * respectively. Therefore it is impossible for the allocated object to + * contiguously line up with the right guard page. + * + * However, we test that an access to memory beyond the gap results in KFENCE + * detecting an OOB access. + */ +static void test_kmalloc_aligned_oob_read(struct kunit *test) +{ + const size_t size = 73; + const size_t align = kmalloc_cache_alignment(size); + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_kmalloc_aligned_oob_read, + .is_write = false, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + + /* + * The object is offset to the right, so there won't be an OOB to the + * left of it. + */ + READ_ONCE(*(buf - 1)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* + * @buf must be aligned on @align, therefore buf + size belongs to the + * same page -> no OOB. + */ + READ_ONCE(*(buf + size)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Overflowing by @align bytes will result in an OOB. */ + expect.addr = buf + size + align; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + test_free(buf); +} + +static void test_kmalloc_aligned_oob_write(struct kunit *test) +{ + const size_t size = 73; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_kmalloc_aligned_oob_write, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + /* + * The object is offset to the right, so we won't get a page + * fault immediately after it. + */ + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1); + KUNIT_EXPECT_FALSE(test, report_available()); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test cache shrinking and destroying with KFENCE. */ +static void test_shrink_memcache(struct kunit *test) +{ + const size_t size = 32; + void *buf; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + kmem_cache_shrink(test_cache); + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void ctor_set_x(void *obj) +{ + /* Every object has at least 8 bytes. */ + memset(obj, 'x', 8); +} + +/* Ensure that SL*B does not modify KFENCE objects on bulk free. */ +static void test_free_bulk(struct kunit *test) +{ + int iter; + + for (iter = 0; iter < 5; iter++) { + const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0, + (iter & 1) ? ctor_set_x : NULL); + void *objects[] = { + test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + }; + + kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects); + KUNIT_ASSERT_FALSE(test, report_available()); + test_cache_destroy(); + } +} + +/* Test init-on-free works. */ +static void test_init_on_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_init_on_free, + .is_write = false, + }; + int i; + + if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)) + return; + /* Assume it hasn't been disabled on command line. */ + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + expect.addr[i] = i + 1; + test_free(expect.addr); + + for (i = 0; i < size; i++) { + /* + * This may fail if the page was recycled by KFENCE and then + * written to again -- this however, is near impossible with a + * default config. + */ + KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0); + + if (!i) /* Only check first access to not fail test if page is ever re-protected. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + } +} + +/* Ensure that constructors work properly. */ +static void test_memcache_ctor(struct kunit *test) +{ + const size_t size = 32; + char *buf; + int i; + + setup_test_cache(test, size, 0, ctor_set_x); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + + for (i = 0; i < 8; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)'x'); + + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* Test that memory is zeroed if requested. */ +static void test_gfpzero(struct kunit *test) +{ + const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */ + char *buf1, *buf2; + int i; + + if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) { + kunit_warn(test, "skipping ... would take too long\n"); + return; + } + + setup_test_cache(test, size, 0, NULL); + buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + buf1[i] = i + 1; + test_free(buf1); + + /* Try to get same address again -- this can take a while. */ + for (i = 0;; i++) { + buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY); + if (buf1 == buf2) + break; + test_free(buf2); + + if (i == CONFIG_KFENCE_NUM_OBJECTS) { + kunit_warn(test, "giving up ... cannot get same object back\n"); + return; + } + } + + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf2[i], (char)0); + + test_free(buf2); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void test_invalid_access(struct kunit *test) +{ + const struct expect_report expect = { + .type = KFENCE_ERROR_INVALID, + .fn = test_invalid_access, + .addr = &__kfence_pool[10], + .is_write = false, + }; + + READ_ONCE(__kfence_pool[10]); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test SLAB_TYPESAFE_BY_RCU works. */ +static void test_memcache_typesafe_by_rcu(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_memcache_typesafe_by_rcu, + .is_write = false, + }; + + setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + *expect.addr = 42; + + rcu_read_lock(); + test_free(expect.addr); + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + /* + * Up to this point, memory should not have been freed yet, and + * therefore there should be no KFENCE report from the above access. + */ + rcu_read_unlock(); + + /* Above access to @expect.addr should not have generated a report! */ + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Only after rcu_barrier() is the memory guaranteed to be freed. */ + rcu_barrier(); + + /* Expect use-after-free. */ + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test krealloc(). */ +static void test_krealloc(struct kunit *test) +{ + const size_t size = 32; + const struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_krealloc, + .addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY), + .is_write = false, + }; + char *buf = expect.addr; + int i; + + KUNIT_EXPECT_FALSE(test, test_cache); + KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */ + for (i = 0; i < size; i++) + buf[i] = i + 1; + + /* Check that we successfully change the size. */ + buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */ + /* Note: Might no longer be a KFENCE alloc. */ + KUNIT_EXPECT_GE(test, ksize(buf), size * 3); + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + for (; i < size * 3; i++) /* Fill to extra bytes. */ + buf[i] = i + 1; + + buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */ + KUNIT_EXPECT_GE(test, ksize(buf), size * 2); + for (i = 0; i < size * 2; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + + buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */ + KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR); + KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */ + + READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */ + KUNIT_ASSERT_TRUE(test, report_matches(&expect)); +} + +/* Test that some objects from a bulk allocation belong to KFENCE pool. */ +static void test_memcache_alloc_bulk(struct kunit *test) +{ + const size_t size = 32; + bool pass = false; + unsigned long timeout; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. + */ + timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + do { + void *objects[100]; + int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), + objects); + if (!num) + continue; + for (i = 0; i < ARRAY_SIZE(objects); i++) { + if (is_kfence_address(objects[i])) { + pass = true; + break; + } + } + kmem_cache_free_bulk(test_cache, num, objects); + /* + * kmem_cache_alloc_bulk() disables interrupts, and calling it + * in a tight loop may not give KFENCE a chance to switch the + * static branch. Call cond_resched() to let KFENCE chime in. + */ + cond_resched(); + } while (!pass && time_before(jiffies, timeout)); + + KUNIT_EXPECT_TRUE(test, pass); + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* + * KUnit does not provide a way to provide arguments to tests, and we encode + * additional info in the name. Set up 2 tests per test case, one using the + * default allocator, and another using a custom memcache (suffix '-memcache'). + */ +#define KFENCE_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name }, \ + { .run_case = test_name, .name = #test_name "-memcache" } + +static struct kunit_case kfence_test_cases[] = { + KFENCE_KUNIT_CASE(test_out_of_bounds_read), + KFENCE_KUNIT_CASE(test_out_of_bounds_write), + KFENCE_KUNIT_CASE(test_use_after_free_read), + KFENCE_KUNIT_CASE(test_double_free), + KFENCE_KUNIT_CASE(test_invalid_addr_free), + KFENCE_KUNIT_CASE(test_corruption), + KFENCE_KUNIT_CASE(test_free_bulk), + KFENCE_KUNIT_CASE(test_init_on_free), + KUNIT_CASE(test_kmalloc_aligned_oob_read), + KUNIT_CASE(test_kmalloc_aligned_oob_write), + KUNIT_CASE(test_shrink_memcache), + KUNIT_CASE(test_memcache_ctor), + KUNIT_CASE(test_invalid_access), + KUNIT_CASE(test_gfpzero), + KUNIT_CASE(test_memcache_typesafe_by_rcu), + KUNIT_CASE(test_krealloc), + KUNIT_CASE(test_memcache_alloc_bulk), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); i++) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + /* Any test with 'memcache' in its name will want a memcache. */ + if (strstr(test->name, "memcache")) + test->priv = TEST_PRIV_WANT_MEMCACHE; + else + test->priv = NULL; + + return 0; +} + +static void test_exit(struct kunit *test) +{ + test_cache_destroy(); +} + +static struct kunit_suite kfence_test_suite = { + .name = "kfence", + .test_cases = kfence_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL }; + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kfence_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kfence_test_suites); +} + +static void kfence_test_exit(void) +{ + __kunit_test_suites_exit(kfence_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kfence_test_init); +module_exit(kfence_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Alexander Potapenko , Marco Elver "); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 4dbfa9a382e4..901bd7ee83d8 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -156,7 +156,12 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, pr_cont(" ]"); } -void kfence_report_error(unsigned long address, struct pt_regs *regs, +static const char *get_access_type(bool is_write) +{ + return is_write ? "write" : "read"; +} + +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, const struct kfence_metadata *meta, enum kfence_error_type type) { unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; @@ -194,17 +199,19 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs, case KFENCE_ERROR_OOB: { const bool left_of_object = address < meta->addr; - pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", - (void *)address, + pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Out-of-bounds %s at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", + get_access_type(is_write), (void *)address, left_of_object ? meta->addr - address : address - meta->addr, left_of_object ? "left" : "right", object_index); break; } case KFENCE_ERROR_UAF: - pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n", - (void *)address, object_index); + pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Use-after-free %s at 0x" PTR_FMT " (in kfence-#%zd):\n", + get_access_type(is_write), (void *)address, object_index); break; case KFENCE_ERROR_CORRUPTION: pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); @@ -213,8 +220,10 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs, pr_cont(" (in kfence-#%zd):\n", object_index); break; case KFENCE_ERROR_INVALID: - pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address); + pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Invalid %s at 0x" PTR_FMT ":\n", get_access_type(is_write), + (void *)address); break; case KFENCE_ERROR_INVALID_FREE: pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); From 0825c1d57f02e3fb228bbecad827956d4c796d3a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:35 -0800 Subject: [PATCH 453/506] MAINTAINERS: add entry for KFENCE Add entry for KFENCE maintainers. Link: https://lkml.kernel.org/r/20201103175841.3495947-10-elver@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Reviewed-by: SeongJae Park Co-developed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christopher Lameter Cc: Dave Hansen Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Joern Engel Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c71664ca8bfd..40040db747fc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9867,6 +9867,18 @@ F: include/linux/keyctl.h F: include/uapi/linux/keyctl.h F: security/keys/ +KFENCE +M: Alexander Potapenko +M: Marco Elver +R: Dmitry Vyukov +L: kasan-dev@googlegroups.com +S: Maintained +F: Documentation/dev-tools/kfence.rst +F: arch/*/include/asm/kfence.h +F: include/linux/kfence.h +F: lib/Kconfig.kfence +F: mm/kfence/ + KFIFO M: Stefani Seibold S: Maintained From 35beccf0926d42ee0d56e41979ec8cdf814c4769 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 25 Feb 2021 17:19:40 -0800 Subject: [PATCH 454/506] kfence: report sensitive information based on no_hash_pointers We cannot rely on CONFIG_DEBUG_KERNEL to decide if we're running a "debug kernel" where we can safely show potentially sensitive information in the kernel log. Instead, simply rely on the newly introduced "no_hash_pointers" to print unhashed kernel pointers, as well as decide if our reports can include other potentially sensitive information such as registers and corrupted bytes. Link: https://lkml.kernel.org/r/20210223082043.1972742-1-elver@google.com Signed-off-by: Marco Elver Cc: Timur Tabi Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kfence.rst | 8 ++++---- mm/kfence/core.c | 10 +++------- mm/kfence/kfence.h | 7 ------- mm/kfence/kfence_test.c | 2 +- mm/kfence/report.c | 18 ++++++++++-------- 5 files changed, 18 insertions(+), 27 deletions(-) diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 58a0a5fa1ddc..fdf04e741ea5 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -88,8 +88,8 @@ A typical out-of-bounds access looks like this:: The header of the report provides a short summary of the function involved in the access. It is followed by more detailed information about the access and -its origin. Note that, real kernel addresses are only shown for -``CONFIG_DEBUG_KERNEL=y`` builds. +its origin. Note that, real kernel addresses are only shown when using the +kernel command line option ``no_hash_pointers``. Use-after-free accesses are reported as:: @@ -184,8 +184,8 @@ invalidly written bytes (offset from the address) are shown; in this representation, '.' denote untouched bytes. In the example above ``0xac`` is the value written to the invalid address at offset 0, and the remaining '.' denote that no following bytes have been touched. Note that, real values are -only shown for ``CONFIG_DEBUG_KERNEL=y`` builds; to avoid information -disclosure for non-debug builds, '!' is used instead to denote invalidly +only shown if the kernel was booted with ``no_hash_pointers``; to avoid +information disclosure otherwise, '!' is used instead to denote invalidly written bytes. And finally, KFENCE may also report on invalid accesses to any protected page diff --git a/mm/kfence/core.c b/mm/kfence/core.c index cfe3d32ac5b7..3b8ec938470a 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -646,13 +646,9 @@ void __init kfence_init(void) WRITE_ONCE(kfence_enabled, true); schedule_delayed_work(&kfence_timer, 0); - pr_info("initialized - using %lu bytes for %d objects", KFENCE_POOL_SIZE, - CONFIG_KFENCE_NUM_OBJECTS); - if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) - pr_cont(" at 0x%px-0x%px\n", (void *)__kfence_pool, - (void *)(__kfence_pool + KFENCE_POOL_SIZE)); - else - pr_cont("\n"); + pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, + CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, + (void *)(__kfence_pool + KFENCE_POOL_SIZE)); } void kfence_shutdown_cache(struct kmem_cache *s) diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 1accc840dbbe..24065321ff8a 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -16,13 +16,6 @@ #include "../slab.h" /* for struct kmem_cache */ -/* For non-debug builds, avoid leaking kernel pointers into dmesg. */ -#ifdef CONFIG_DEBUG_KERNEL -#define PTR_FMT "%px" -#else -#define PTR_FMT "%p" -#endif - /* * Get the canary byte pattern for @addr. Use a pattern that varies based on the * lower 3 bits of the address, to detect memory corruptions with higher diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index db1bb596acaf..4acf4251ee04 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -146,7 +146,7 @@ static bool report_matches(const struct expect_report *r) break; } - cur += scnprintf(cur, end - cur, " 0x" PTR_FMT, (void *)r->addr); + cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr); spin_lock_irqsave(&observed.lock, flags); if (!report_available()) diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 901bd7ee83d8..4a424de44e2d 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -19,6 +19,8 @@ #include "kfence.h" +extern bool no_hash_pointers; + /* Helper function to either print to a seq_file or to console. */ __printf(2, 3) static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) @@ -118,7 +120,7 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met } seq_con_printf(seq, - "kfence-#%zd [0x" PTR_FMT "-0x" PTR_FMT + "kfence-#%zd [0x%p-0x%p" ", size=%d, cache=%s] allocated by task %d:\n", meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, (cache && cache->name) ? cache->name : "", meta->alloc_track.pid); @@ -148,7 +150,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, for (cur = (const u8 *)address; cur < end; cur++) { if (*cur == KFENCE_CANARY_PATTERN(cur)) pr_cont(" ."); - else if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + else if (no_hash_pointers) pr_cont(" 0x%02x", *cur); else /* Do not leak kernel memory in non-debug builds. */ pr_cont(" !"); @@ -201,7 +203,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Out-of-bounds %s at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n", + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n", get_access_type(is_write), (void *)address, left_of_object ? meta->addr - address : address - meta->addr, left_of_object ? "left" : "right", object_index); @@ -210,24 +212,24 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r case KFENCE_ERROR_UAF: pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Use-after-free %s at 0x" PTR_FMT " (in kfence-#%zd):\n", + pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n", get_access_type(is_write), (void *)address, object_index); break; case KFENCE_ERROR_CORRUPTION: pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Corrupted memory at 0x" PTR_FMT " ", (void *)address); + pr_err("Corrupted memory at 0x%p ", (void *)address); print_diff_canary(address, 16, meta); pr_cont(" (in kfence-#%zd):\n", object_index); break; case KFENCE_ERROR_INVALID: pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Invalid %s at 0x" PTR_FMT ":\n", get_access_type(is_write), + pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write), (void *)address); break; case KFENCE_ERROR_INVALID_FREE: pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Invalid free of 0x" PTR_FMT " (in kfence-#%zd):\n", (void *)address, + pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address, object_index); break; } @@ -242,7 +244,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* Print report footer. */ pr_err("\n"); - if (IS_ENABLED(CONFIG_DEBUG_KERNEL) && regs) + if (no_hash_pointers && regs) show_regs(regs); else dump_stack_print_info(KERN_ERR); From 9c0dee54eb91d48cca048bd7bd2c1f4a166e0252 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:44 -0800 Subject: [PATCH 455/506] tracing: add error_report_end trace point Patch series "Add error_report_end tracepoint to KFENCE and KASAN", v3. This patchset adds a tracepoint, error_repor_end, that is to be used by KFENCE, KASAN, and potentially other bug detection tools, when they print an error report. One of the possible use cases is userspace collection of kernel error reports: interested parties can subscribe to the tracing event via tracefs, and get notified when an error report occurs. This patch (of 3): Introduce error_report_end tracepoint. It can be used in debugging tools like KASAN, KFENCE, etc. to provide extensions to the error reporting mechanisms (e.g. allow tests hook into error reporting, ease error report collection from production kernels). Another benefit would be making use of ftrace for debugging or benchmarking the tools themselves. Should we need it, the tracepoint name leaves us with the possibility to introduce a complementary error_report_start tracepoint in the future. Link: https://lkml.kernel.org/r/20210121131915.1331302-1-glider@google.com Link: https://lkml.kernel.org/r/20210121131915.1331302-2-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/error_report.h | 74 +++++++++++++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/error_report-traces.c | 11 +++++ 3 files changed, 86 insertions(+) create mode 100644 include/trace/events/error_report.h create mode 100644 kernel/trace/error_report-traces.c diff --git a/include/trace/events/error_report.h b/include/trace/events/error_report.h new file mode 100644 index 000000000000..96f64bf218b2 --- /dev/null +++ b/include/trace/events/error_report.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Declarations for error reporting tracepoints. + * + * Copyright (C) 2021, Google LLC. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM error_report + +#if !defined(_TRACE_ERROR_REPORT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ERROR_REPORT_H + +#include + +#ifndef __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY +#define __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY + +enum error_detector { + ERROR_DETECTOR_KFENCE, + ERROR_DETECTOR_KASAN +}; + +#endif /* __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY */ + +#define error_detector_list \ + EM(ERROR_DETECTOR_KFENCE, "kfence") \ + EMe(ERROR_DETECTOR_KASAN, "kasan") +/* Always end the list with an EMe. */ + +#undef EM +#undef EMe + +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +error_detector_list + +#undef EM +#undef EMe + +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +#define show_error_detector_list(val) \ + __print_symbolic(val, error_detector_list) + +DECLARE_EVENT_CLASS(error_report_template, + TP_PROTO(enum error_detector error_detector, unsigned long id), + TP_ARGS(error_detector, id), + TP_STRUCT__entry(__field(enum error_detector, error_detector) + __field(unsigned long, id)), + TP_fast_assign(__entry->error_detector = error_detector; + __entry->id = id;), + TP_printk("[%s] %lx", + show_error_detector_list(__entry->error_detector), + __entry->id)); + +/** + * error_report_end - called after printing the error report + * @error_detector: short string describing the error detection tool + * @id: pseudo-unique descriptor identifying the report + * (e.g. the memory access address) + * + * This event occurs right after a debugging tool finishes printing the error + * report. + */ +DEFINE_EVENT(error_report_template, error_report_end, + TP_PROTO(enum error_detector error_detector, unsigned long id), + TP_ARGS(error_detector, id)); + +#endif /* _TRACE_ERROR_REPORT_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 7e44cea89fdc..b28d3e5013cd 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o +obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c new file mode 100644 index 000000000000..f89792c25b11 --- /dev/null +++ b/kernel/trace/error_report-traces.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Error reporting trace points. + * + * Copyright (C) 2021, Google LLC. + */ + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end); From f2b84d2e40eb1a17f72dc4a1da463ec8de649f19 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:47 -0800 Subject: [PATCH 456/506] kfence: use error_report_end tracepoint Make it possible to trace KFENCE error reporting. A good usecase is watching for trace events from the userspace to detect and process memory corruption reports from the kernel. Link: https://lkml.kernel.org/r/20210121131915.1331302-3-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Vlastimil Babka Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/report.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 4a424de44e2d..ab83d5a59bb1 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -248,6 +249,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r show_regs(regs); else dump_stack_print_info(KERN_ERR); + trace_error_report_end(ERROR_DETECTOR_KFENCE, address); pr_err("==================================================================\n"); lockdep_on(); From d3a61f745e0d089a2484740283a434deb6dd4eb5 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 25 Feb 2021 17:19:51 -0800 Subject: [PATCH 457/506] kasan: use error_report_end tracepoint Make it possible to trace KASAN error reporting. A good usecase is watching for trace events from the userspace to detect and process memory corruption reports from the kernel. Link: https://lkml.kernel.org/r/20210121131915.1331302-4-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 234f35a84f19..87b271206163 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -84,8 +85,9 @@ static void start_report(unsigned long *flags) pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags) +static void end_report(unsigned long *flags, unsigned long addr) { + trace_error_report_end(ERROR_DETECTOR_KASAN, addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); @@ -355,7 +357,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) print_address_description(object, tag); pr_err("\n"); print_memory_metadata(object); - end_report(&flags); + end_report(&flags, (unsigned long)object); } static void __kasan_report(unsigned long addr, size_t size, bool is_write, @@ -401,7 +403,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, dump_stack(); } - end_report(&flags); + end_report(&flags, addr); } bool kasan_report(unsigned long addr, size_t size, bool is_write, From 928501344fc645f80390afc12708c81b3595745d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:19:55 -0800 Subject: [PATCH 458/506] kasan, mm: don't save alloc stacks twice Patch series "kasan: optimizations and fixes for HW_TAGS", v4. This patchset makes the HW_TAGS mode more efficient, mostly by reworking poisoning approaches and simplifying/inlining some internal helpers. With this change, the overhead of HW_TAGS annotations excluding setting and checking memory tags is ~3%. The performance impact caused by tags will be unknown until we have hardware that supports MTE. As a side-effect, this patchset speeds up generic KASAN by ~15%. This patch (of 13): Currently KASAN saves allocation stacks in both kasan_slab_alloc() and kasan_kmalloc() annotations. This patch changes KASAN to save allocation stacks for slab objects from kmalloc caches in kasan_kmalloc() only, and stacks for other slab objects in kasan_slab_alloc() only. This change requires ____kasan_kmalloc() knowing whether the object belongs to a kmalloc cache. This is implemented by adding a flag field to the kasan_info structure. That flag is only set for kmalloc caches via a new kasan_cache_create_kmalloc() annotation. Link: https://lkml.kernel.org/r/cover.1612546384.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/7c673ebca8d00f40a7ad6f04ab9a2bddeeae2097.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Will Deacon Cc: Andrey Ryabinin Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 9 +++++++++ mm/kasan/common.c | 18 ++++++++++++++---- mm/slab_common.c | 1 + 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7eaf2d9effb4..3fb31e8a353e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -83,6 +83,7 @@ static inline void kasan_disable_current(void) {} struct kasan_cache { int alloc_meta_offset; int free_meta_offset; + bool is_kmalloc; }; #ifdef CONFIG_KASAN_HW_TAGS @@ -143,6 +144,13 @@ static __always_inline void kasan_cache_create(struct kmem_cache *cache, __kasan_cache_create(cache, size, flags); } +void __kasan_cache_create_kmalloc(struct kmem_cache *cache); +static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) +{ + if (kasan_enabled()) + __kasan_cache_create_kmalloc(cache); +} + size_t __kasan_metadata_size(struct kmem_cache *cache); static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) { @@ -278,6 +286,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} +static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, diff --git a/mm/kasan/common.c b/mm/kasan/common.c index af1768c4fee5..d8d83ca56fe2 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -210,6 +210,11 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, *size = optimal_size; } +void __kasan_cache_create_kmalloc(struct kmem_cache *cache) +{ + cache->kasan_info.is_kmalloc = true; +} + size_t __kasan_metadata_size(struct kmem_cache *cache) { if (!kasan_stack_collection_enabled()) @@ -394,17 +399,22 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +static void set_alloc_info(struct kmem_cache *cache, void *object, + gfp_t flags, bool is_kmalloc) { struct kasan_alloc_meta *alloc_meta; + /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */ + if (cache->kasan_info.is_kmalloc && !is_kmalloc) + return; + alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) kasan_set_track(&alloc_meta->alloc_track, flags); } static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags, bool keep_tag) + size_t size, gfp_t flags, bool is_kmalloc) { unsigned long redzone_start; unsigned long redzone_end; @@ -423,7 +433,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, KASAN_GRANULE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, KASAN_GRANULE_SIZE); - tag = assign_tag(cache, object, false, keep_tag); + tag = assign_tag(cache, object, false, is_kmalloc); /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ kasan_unpoison(set_tag(object, tag), size); @@ -431,7 +441,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, KASAN_KMALLOC_REDZONE); if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags); + set_alloc_info(cache, (void *)object, flags, is_kmalloc); return set_tag(object, tag); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 284954ef1da5..897c3a446b04 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -643,6 +643,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, panic("Out of memory when creating slab %s\n", name); create_boot_cache(s, name, size, flags, useroffset, usersize); + kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; return s; From e2db1a9aa3814960a56583df39ea71e36d802278 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:19:59 -0800 Subject: [PATCH 459/506] kasan, mm: optimize kmalloc poisoning For allocations from kmalloc caches, kasan_kmalloc() always follows kasan_slab_alloc(). Currenly, both of them unpoison the whole object, which is unnecessary. This patch provides separate implementations for both annotations: kasan_slab_alloc() unpoisons the whole object, and kasan_kmalloc() only poisons the redzone. For generic KASAN, the redzone start might not be aligned to KASAN_GRANULE_SIZE. Therefore, the poisoning is split in two parts: kasan_poison_last_granule() poisons the unaligned part, and then kasan_poison() poisons the rest. This patch also clarifies alignment guarantees of each of the poisoning functions and drops the unnecessary round_up() call for redzone_end. With this change, the early SLUB cache annotation needs to be changed to kasan_slab_alloc(), as kasan_kmalloc() doesn't unpoison objects now. The number of poisoned bytes for objects in this cache stays the same, as kmem_cache_node->object_size is equal to sizeof(struct kmem_cache_node). Link: https://lkml.kernel.org/r/7e3961cb52be380bc412860332063f5f7ce10d13.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 93 +++++++++++++++++++++++++++++++---------------- mm/kasan/kasan.h | 43 +++++++++++++++++++++- mm/kasan/shadow.c | 28 +++++++------- mm/slub.c | 3 +- 4 files changed, 119 insertions(+), 48 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d8d83ca56fe2..218b23a5a597 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -278,21 +278,11 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object) * based on objects indexes, so that objects that are next to each other * get different tags. */ -static u8 assign_tag(struct kmem_cache *cache, const void *object, - bool init, bool keep_tag) +static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return 0xff; - /* - * 1. When an object is kmalloc()'ed, two hooks are called: - * kasan_slab_alloc() and kasan_kmalloc(). We assign the - * tag only in the first one. - * 2. We reuse the same tag for krealloc'ed objects. - */ - if (keep_tag) - return get_tag(object); - /* * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU * set, assign a tag when the object is being allocated (init == false). @@ -325,7 +315,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, } /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ - object = set_tag(object, assign_tag(cache, object, true, false)); + object = set_tag(object, assign_tag(cache, object, true)); return (void *)object; } @@ -413,12 +403,46 @@ static void set_alloc_info(struct kmem_cache *cache, void *object, kasan_set_track(&alloc_meta->alloc_track, flags); } +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, + void *object, gfp_t flags) +{ + u8 tag; + void *tagged_object; + + if (gfpflags_allow_blocking(flags)) + kasan_quarantine_reduce(); + + if (unlikely(object == NULL)) + return NULL; + + if (is_kfence_address(object)) + return (void *)object; + + /* + * Generate and assign random tag for tag-based modes. + * Tag is ignored in set_tag() for the generic mode. + */ + tag = assign_tag(cache, object, false); + tagged_object = set_tag(object, tag); + + /* + * Unpoison the whole object. + * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. + */ + kasan_unpoison(tagged_object, cache->object_size); + + /* Save alloc info (if possible) for non-kmalloc() allocations. */ + if (kasan_stack_collection_enabled()) + set_alloc_info(cache, (void *)object, flags, false); + + return tagged_object; +} + static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags, bool is_kmalloc) + size_t size, gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; - u8 tag; if (gfpflags_allow_blocking(flags)) kasan_quarantine_reduce(); @@ -429,33 +453,41 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, if (is_kfence_address(kasan_reset_tag(object))) return (void *)object; + /* + * The object has already been unpoisoned by kasan_slab_alloc() for + * kmalloc() or by ksize() for krealloc(). + */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule((void *)object, size); + + /* Poison the aligned part of the redzone. */ redzone_start = round_up((unsigned long)(object + size), KASAN_GRANULE_SIZE); - redzone_end = round_up((unsigned long)object + cache->object_size, - KASAN_GRANULE_SIZE); - tag = assign_tag(cache, object, false, is_kmalloc); - - /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ - kasan_unpoison(set_tag(object, tag), size); + redzone_end = (unsigned long)object + cache->object_size; kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); + /* + * Save alloc info (if possible) for kmalloc() allocations. + * This also rewrites the alloc info when called from kasan_krealloc(). + */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, is_kmalloc); + set_alloc_info(cache, (void *)object, flags, true); - return set_tag(object, tag); -} - -void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, - void *object, gfp_t flags) -{ - return ____kasan_kmalloc(cache, object, cache->object_size, flags, false); + /* Keep the tag that was set by kasan_slab_alloc(). */ + return (void *)object; } void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags) { - return ____kasan_kmalloc(cache, object, size, flags, true); + return ____kasan_kmalloc(cache, object, size, flags); } EXPORT_SYMBOL(__kasan_kmalloc); @@ -496,8 +528,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag if (unlikely(!PageSlab(page))) return __kasan_kmalloc_large(object, size, flags); else - return ____kasan_kmalloc(page->slab_cache, object, size, - flags, true); + return ____kasan_kmalloc(page->slab_cache, object, size, flags); } void __kasan_kfree_large(void *ptr, unsigned long ip) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fb883740fd27..222858e2e6af 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -367,12 +367,51 @@ static inline bool kasan_byte_accessible(const void *addr) #else /* CONFIG_KASAN_HW_TAGS */ -void kasan_poison(const void *address, size_t size, u8 value); -void kasan_unpoison(const void *address, size_t size); +/** + * kasan_poison - mark the memory range as unaccessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * @value - value that's written to metadata for the range + * + * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. + */ +void kasan_poison(const void *addr, size_t size, u8 value); + +/** + * kasan_unpoison - mark the memory range as accessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * + * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before + * marking the range. + * For the generic mode, the last granule of the memory range gets partially + * unpoisoned based on the @size. + */ +void kasan_unpoison(const void *addr, size_t size); + bool kasan_byte_accessible(const void *addr); #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN_GENERIC + +/** + * kasan_poison_last_granule - mark the last granule of the memory range as + * unaccessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * + * This function is only available for the generic mode, as it's the only mode + * that has partially poisoned memory granules. + */ +void kasan_poison_last_granule(const void *address, size_t size); + +#else /* CONFIG_KASAN_GENERIC */ + +static inline void kasan_poison_last_granule(const void *address, size_t size) { } + +#endif /* CONFIG_KASAN_GENERIC */ + /* * Exported functions for interfaces called from assembly or from generated * code. Declarations here to avoid warning about missing declarations. diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 1372a2fc0ca9..1ed7817e4ee6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -69,10 +69,6 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -/* - * Poisons the shadow memory for 'size' bytes starting from 'addr'. - * Memory addresses should be aligned to KASAN_GRANULE_SIZE. - */ void kasan_poison(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -83,12 +79,12 @@ void kasan_poison(const void *address, size_t size, u8 value) * addresses to this function. */ address = kasan_reset_tag(address); - size = round_up(size, KASAN_GRANULE_SIZE); /* Skip KFENCE memory if called explicitly outside of sl*b. */ if (is_kfence_address(address)) return; + size = round_up(size, KASAN_GRANULE_SIZE); shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); @@ -96,6 +92,16 @@ void kasan_poison(const void *address, size_t size, u8 value) } EXPORT_SYMBOL(kasan_poison); +#ifdef CONFIG_KASAN_GENERIC +void kasan_poison_last_granule(const void *address, size_t size) +{ + if (size & KASAN_GRANULE_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + *shadow = size & KASAN_GRANULE_MASK; + } +} +#endif + void kasan_unpoison(const void *address, size_t size) { u8 tag = get_tag(address); @@ -115,16 +121,12 @@ void kasan_unpoison(const void *address, size_t size) if (is_kfence_address(address)) return; + /* Unpoison round_up(size, KASAN_GRANULE_SIZE) bytes. */ kasan_poison(address, size, tag); - if (size & KASAN_GRANULE_MASK) { - u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); - - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - *shadow = tag; - else /* CONFIG_KASAN_GENERIC */ - *shadow = size & KASAN_GRANULE_MASK; - } + /* Partially poison the last granule for the generic mode. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(address, size); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/slub.c b/mm/slub.c index 383616af28c4..e26c274b4657 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3579,8 +3579,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), - GFP_KERNEL); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse = 1; page->frozen = 0; From 43a219cbe5a46ec3f6a1874bb2cb2fd4de8322cc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:03 -0800 Subject: [PATCH 460/506] kasan: optimize large kmalloc poisoning Similarly to kasan_kmalloc(), kasan_kmalloc_large() doesn't need to unpoison the object as it as already unpoisoned by alloc_pages() (or by ksize() for krealloc()). This patch changes kasan_kmalloc_large() to only poison the redzone. Link: https://lkml.kernel.org/r/33dee5aac0e550ad7f8e26f590c9b02c6129b4a3.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 218b23a5a597..dcdc92948364 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -494,7 +494,6 @@ EXPORT_SYMBOL(__kasan_kmalloc); void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { - struct page *page; unsigned long redzone_start; unsigned long redzone_end; @@ -504,12 +503,23 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, if (unlikely(ptr == NULL)) return NULL; - page = virt_to_page(ptr); + /* + * The object has already been unpoisoned by kasan_alloc_pages() for + * alloc_pages() or by ksize() for krealloc(). + */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(ptr, size); + + /* Poison the aligned part of the redzone. */ redzone_start = round_up((unsigned long)(ptr + size), KASAN_GRANULE_SIZE); - redzone_end = (unsigned long)ptr + page_size(page); - - kasan_unpoison(ptr, size); + redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr)); kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_PAGE_REDZONE); From df54b383124cf3e09f66644ee8a2eb977e8c7f26 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:07 -0800 Subject: [PATCH 461/506] kasan: clean up setting free info in kasan_slab_free Put kasan_stack_collection_enabled() check and kasan_set_free_info() calls next to each other. The way this was previously implemented was a minor optimization that relied of the the fact that kasan_stack_collection_enabled() is always true for generic KASAN. The confusion that this brings outweights saving a few instructions. Link: https://lkml.kernel.org/r/f838e249be5ab5810bf54a36ef5072cfd80e2da7.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index dcdc92948364..48d51daeda95 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -350,13 +350,11 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE); - if (!kasan_stack_collection_enabled()) - return false; - if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; - kasan_set_free_info(cache, object, tag); + if (kasan_stack_collection_enabled()) + kasan_set_free_info(cache, object, tag); return kasan_quarantine_put(cache, object); } From 200072ce33b298cf14d3ed2a570f5eb27609677d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:11 -0800 Subject: [PATCH 462/506] kasan: unify large kfree checks Unify checks in kasan_kfree_large() and in kasan_slab_free_mempool() for large allocations as it's done for small kfree() allocations. With this change, kasan_slab_free_mempool() starts checking that the first byte of the memory that's being freed is accessible. Link: https://lkml.kernel.org/r/14ffc4cd867e0b1ed58f7527e3b748a1b4ad08aa.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 16 ++++++++-------- mm/kasan/common.c | 36 ++++++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 3fb31e8a353e..b91732bd05d7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -200,6 +200,13 @@ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object) return false; } +void __kasan_kfree_large(void *ptr, unsigned long ip); +static __always_inline void kasan_kfree_large(void *ptr) +{ + if (kasan_enabled()) + __kasan_kfree_large(ptr, _RET_IP_); +} + void __kasan_slab_free_mempool(void *ptr, unsigned long ip); static __always_inline void kasan_slab_free_mempool(void *ptr) { @@ -247,13 +254,6 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } -void __kasan_kfree_large(void *ptr, unsigned long ip); -static __always_inline void kasan_kfree_large(void *ptr) -{ - if (kasan_enabled()) - __kasan_kfree_large(ptr, _RET_IP_); -} - /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. @@ -302,6 +302,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object) { return false; } +static inline void kasan_kfree_large(void *ptr) {} static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) @@ -322,7 +323,6 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_kfree_large(void *ptr) {} static inline bool kasan_check_byte(const void *address) { return true; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 48d51daeda95..8a3d66393dc5 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -364,6 +364,31 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return ____kasan_slab_free(cache, object, ip, true); } +static bool ____kasan_kfree_large(void *ptr, unsigned long ip) +{ + if (ptr != page_address(virt_to_head_page(ptr))) { + kasan_report_invalid_free(ptr, ip); + return true; + } + + if (!kasan_byte_accessible(ptr)) { + kasan_report_invalid_free(ptr, ip); + return true; + } + + /* + * The object will be poisoned by kasan_free_pages() or + * kasan_slab_free_mempool(). + */ + + return false; +} + +void __kasan_kfree_large(void *ptr, unsigned long ip) +{ + ____kasan_kfree_large(ptr, ip); +} + void __kasan_slab_free_mempool(void *ptr, unsigned long ip) { struct page *page; @@ -377,10 +402,8 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. */ if (unlikely(!PageSlab(page))) { - if (ptr != page_address(page)) { - kasan_report_invalid_free(ptr, ip); + if (____kasan_kfree_large(ptr, ip)) return; - } kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); } else { ____kasan_slab_free(page->slab_cache, ptr, ip, false); @@ -539,13 +562,6 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag return ____kasan_kmalloc(page->slab_cache, object, size, flags); } -void __kasan_kfree_large(void *ptr, unsigned long ip) -{ - if (ptr != page_address(virt_to_head_page(ptr))) - kasan_report_invalid_free(ptr, ip); - /* The object will be poisoned by kasan_free_pages(). */ -} - bool __kasan_check_byte(const void *address, unsigned long ip) { if (!kasan_byte_accessible(address)) { From b87c28b9a7ef64590943435ea59f40092f2376d5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:15 -0800 Subject: [PATCH 463/506] kasan: rework krealloc tests This patch reworks KASAN-KUnit tests for krealloc() to: 1. Check both slab and page_alloc based krealloc() implementations. 2. Allow at least one full granule to fit between old and new sizes for each KASAN mode, and check accesses to that granule accordingly. Link: https://lkml.kernel.org/r/c707f128a2bb9f2f05185d1eb52192cf179cf4fa.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 91 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 10 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 25576303897b..e1bd1d1096de 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -252,11 +252,14 @@ static void kmalloc_large_oob_right(struct kunit *test) kfree(ptr); } -static void kmalloc_oob_krealloc_more(struct kunit *test) +static void krealloc_more_oob_helper(struct kunit *test, + size_t size1, size_t size2) { char *ptr1, *ptr2; - size_t size1 = 17; - size_t size2 = 19; + size_t middle; + + KUNIT_ASSERT_LT(test, size1, size2); + middle = size1 + (size2 - size1) / 2; ptr1 = kmalloc(size1, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -264,15 +267,31 @@ static void kmalloc_oob_krealloc_more(struct kunit *test) ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x'); + /* All offsets up to size2 must be accessible. */ + ptr2[size1 - 1] = 'x'; + ptr2[size1] = 'x'; + ptr2[middle] = 'x'; + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. */ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + kfree(ptr2); } -static void kmalloc_oob_krealloc_less(struct kunit *test) +static void krealloc_less_oob_helper(struct kunit *test, + size_t size1, size_t size2) { char *ptr1, *ptr2; - size_t size1 = 17; - size_t size2 = 15; + size_t middle; + + KUNIT_ASSERT_LT(test, size2, size1); + middle = size2 + (size1 - size2) / 2; ptr1 = kmalloc(size1, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -280,10 +299,60 @@ static void kmalloc_oob_krealloc_less(struct kunit *test) ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2 + OOB_TAG_OFF] = 'x'); + /* Must be accessible for all modes. */ + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. */ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + + /* + * For all modes all size2, middle, and size1 should land in separate + * granules and thus the latter two offsets should be inaccessible. + */ + KUNIT_EXPECT_LE(test, round_up(size2, KASAN_GRANULE_SIZE), + round_down(middle, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_LE(test, round_up(middle, KASAN_GRANULE_SIZE), + round_down(size1, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[middle] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1 - 1] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1] = 'x'); + kfree(ptr2); } +static void krealloc_more_oob(struct kunit *test) +{ + krealloc_more_oob_helper(test, 201, 235); +} + +static void krealloc_less_oob(struct kunit *test) +{ + krealloc_less_oob_helper(test, 235, 201); +} + +static void krealloc_pagealloc_more_oob(struct kunit *test) +{ + /* page_alloc fallback in only implemented for SLUB. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_more_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 201, + KMALLOC_MAX_CACHE_SIZE + 235); +} + +static void krealloc_pagealloc_less_oob(struct kunit *test) +{ + /* page_alloc fallback in only implemented for SLUB. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_less_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 235, + KMALLOC_MAX_CACHE_SIZE + 201); +} + static void kmalloc_oob_16(struct kunit *test) { struct { @@ -977,8 +1046,10 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(pagealloc_oob_right), KUNIT_CASE(pagealloc_uaf), KUNIT_CASE(kmalloc_large_oob_right), - KUNIT_CASE(kmalloc_oob_krealloc_more), - KUNIT_CASE(kmalloc_oob_krealloc_less), + KUNIT_CASE(krealloc_more_oob), + KUNIT_CASE(krealloc_less_oob), + KUNIT_CASE(krealloc_pagealloc_more_oob), + KUNIT_CASE(krealloc_pagealloc_less_oob), KUNIT_CASE(kmalloc_oob_16), KUNIT_CASE(kmalloc_uaf_16), KUNIT_CASE(kmalloc_oob_in_memset), From 26a5ca7a73be31f76c291465680517cde37051ca Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:19 -0800 Subject: [PATCH 464/506] kasan, mm: fail krealloc on freed objects Currently, if krealloc() is called on a freed object with KASAN enabled, it allocates and returns a new object, but doesn't copy any memory from the old one as ksize() returns 0. This makes the caller believe that krealloc() succeeded (KASAN report is printed though). This patch adds an accessibility check into __do_krealloc(). If the check fails, krealloc() returns NULL. This check duplicates the one in ksize(); this is fixed in the following patch. This patch also adds a KASAN-KUnit test to check krealloc() behaviour when it's called on a freed object. Link: https://lkml.kernel.org/r/cbcf7b02be0a1ca11de4f833f2ff0b3f2c9b00c8.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 20 ++++++++++++++++++++ mm/slab_common.c | 3 +++ 2 files changed, 23 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index e1bd1d1096de..e5647d147b35 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -353,6 +353,25 @@ static void krealloc_pagealloc_less_oob(struct kunit *test) KMALLOC_MAX_CACHE_SIZE + 201); } +/* + * Check that krealloc() detects a use-after-free, returns NULL, + * and doesn't unpoison the freed object. + */ +static void krealloc_uaf(struct kunit *test) +{ + char *ptr1, *ptr2; + int size1 = 201; + int size2 = 235; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL)); + KUNIT_ASSERT_PTR_EQ(test, (void *)ptr2, NULL); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1); +} + static void kmalloc_oob_16(struct kunit *test) { struct { @@ -1050,6 +1069,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(krealloc_less_oob), KUNIT_CASE(krealloc_pagealloc_more_oob), KUNIT_CASE(krealloc_pagealloc_less_oob), + KUNIT_CASE(krealloc_uaf), KUNIT_CASE(kmalloc_oob_16), KUNIT_CASE(kmalloc_uaf_16), KUNIT_CASE(kmalloc_oob_in_memset), diff --git a/mm/slab_common.c b/mm/slab_common.c index 897c3a446b04..4aedb8455352 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1136,6 +1136,9 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, void *ret; size_t ks; + if (likely(!ZERO_OR_NULL_PTR(p)) && !kasan_check_byte(p)) + return NULL; + ks = ksize(p); if (ks >= new_size) { From d12d9ad816299052385bac351fad338a073121b3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:23 -0800 Subject: [PATCH 465/506] kasan, mm: optimize krealloc poisoning Currently, krealloc() always calls ksize(), which unpoisons the whole object including the redzone. This is inefficient, as kasan_krealloc() repoisons the redzone for objects that fit into the same buffer. This patch changes krealloc() instrumentation to use uninstrumented __ksize() that doesn't unpoison the memory. Instead, kasan_kreallos() is changed to unpoison the memory excluding the redzone. For objects that don't fit into the old allocation, this patch disables KASAN accessibility checks when copying memory into a new object instead of unpoisoning it. Link: https://lkml.kernel.org/r/9bef90327c9cb109d736c40115684fd32f49e6b0.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 12 ++++++++++-- mm/slab_common.c | 20 ++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 8a3d66393dc5..1e510649833b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -476,7 +476,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, /* * The object has already been unpoisoned by kasan_slab_alloc() for - * kmalloc() or by ksize() for krealloc(). + * kmalloc() or by kasan_krealloc() for krealloc(). */ /* @@ -526,7 +526,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, /* * The object has already been unpoisoned by kasan_alloc_pages() for - * alloc_pages() or by ksize() for krealloc(). + * alloc_pages() or by kasan_krealloc() for krealloc(). */ /* @@ -554,8 +554,16 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag if (unlikely(object == ZERO_SIZE_PTR)) return (void *)object; + /* + * Unpoison the object's data. + * Part of it might already have been unpoisoned, but it's unknown + * how big that part is. + */ + kasan_unpoison(object, size); + page = virt_to_head_page(object); + /* Piggy-back on kmalloc() instrumentation to poison the redzone. */ if (unlikely(!PageSlab(page))) return __kasan_kmalloc_large(object, size, flags); else diff --git a/mm/slab_common.c b/mm/slab_common.c index 4aedb8455352..88e833986332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1136,19 +1136,27 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, void *ret; size_t ks; - if (likely(!ZERO_OR_NULL_PTR(p)) && !kasan_check_byte(p)) - return NULL; - - ks = ksize(p); + /* Don't use instrumented ksize to allow precise KASAN poisoning. */ + if (likely(!ZERO_OR_NULL_PTR(p))) { + if (!kasan_check_byte(p)) + return NULL; + ks = kfence_ksize(p) ?: __ksize(p); + } else + ks = 0; + /* If the object still fits, repoison it precisely. */ if (ks >= new_size) { p = kasan_krealloc((void *)p, new_size, flags); return (void *)p; } ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), ks); + kasan_enable_current(); + } return ret; } From cde8a7eb778c7c71f70d636aa0bb1ec081b9167c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:27 -0800 Subject: [PATCH 466/506] kasan: ensure poisoning size alignment A previous changes d99f6a10c161 ("kasan: don't round_up too much") attempted to simplify the code by adding a round_up(size) call into kasan_poison(). While this allows to have less round_up() calls around the code, this results in round_up() being called multiple times. This patch removes round_up() of size from kasan_poison() and ensures that all callers round_up() the size explicitly. This patch also adds WARN_ON() alignment checks for address and size to kasan_poison() and kasan_unpoison(). Link: https://lkml.kernel.org/r/3ffe8d4a246ae67a8b5e91f65bf98cd7cba9d7b9.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 9 ++++++--- mm/kasan/kasan.h | 33 ++++++++++++++++++++------------- mm/kasan/shadow.c | 37 ++++++++++++++++++++++--------------- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1e510649833b..dec7375fb884 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -261,7 +261,8 @@ void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison(object, cache->object_size, KASAN_KMALLOC_REDZONE); + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_KMALLOC_REDZONE); } /* @@ -348,7 +349,8 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return true; } - kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE); + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_KMALLOC_FREE); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; @@ -490,7 +492,8 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, /* Poison the aligned part of the redzone. */ redzone_start = round_up((unsigned long)(object + size), KASAN_GRANULE_SIZE); - redzone_end = (unsigned long)object + cache->object_size; + redzone_end = round_up((unsigned long)(object + cache->object_size), + KASAN_GRANULE_SIZE); kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 222858e2e6af..8c55634d6edd 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -330,30 +330,37 @@ static inline u8 kasan_random_tag(void) { return 0; } #ifdef CONFIG_KASAN_HW_TAGS -static inline void kasan_poison(const void *address, size_t size, u8 value) +static inline void kasan_poison(const void *addr, size_t size, u8 value) { - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* Skip KFENCE memory if called explicitly outside of sl*b. */ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - hw_set_mem_tag_range((void *)address, - round_up(size, KASAN_GRANULE_SIZE), value); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + hw_set_mem_tag_range((void *)addr, size, value); } -static inline void kasan_unpoison(const void *address, size_t size) +static inline void kasan_unpoison(const void *addr, size_t size) { - u8 tag = get_tag(address); + u8 tag = get_tag(addr); - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* Skip KFENCE memory if called explicitly outside of sl*b. */ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - hw_set_mem_tag_range((void *)address, - round_up(size, KASAN_GRANULE_SIZE), tag); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + size = round_up(size, KASAN_GRANULE_SIZE); + + hw_set_mem_tag_range((void *)addr, size, tag); } static inline bool kasan_byte_accessible(const void *addr) @@ -370,7 +377,7 @@ static inline bool kasan_byte_accessible(const void *addr) /** * kasan_poison - mark the memory range as unaccessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @size - range size, must be aligned to KASAN_GRANULE_SIZE * @value - value that's written to metadata for the range * * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. @@ -380,7 +387,7 @@ void kasan_poison(const void *addr, size_t size, u8 value); /** * kasan_unpoison - mark the memory range as accessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @size - range size, can be unaligned * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 1ed7817e4ee6..63f43443f5d7 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -69,7 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -void kasan_poison(const void *address, size_t size, u8 value) +void kasan_poison(const void *addr, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -78,55 +78,62 @@ void kasan_poison(const void *address, size_t size, u8 value) * some of the callers (e.g. kasan_poison_object_data) pass tagged * addresses to this function. */ - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* Skip KFENCE memory if called explicitly outside of sl*b. */ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - size = round_up(size, KASAN_GRANULE_SIZE); - shadow_start = kasan_mem_to_shadow(address); - shadow_end = kasan_mem_to_shadow(address + size); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + shadow_start = kasan_mem_to_shadow(addr); + shadow_end = kasan_mem_to_shadow(addr + size); __memset(shadow_start, value, shadow_end - shadow_start); } EXPORT_SYMBOL(kasan_poison); #ifdef CONFIG_KASAN_GENERIC -void kasan_poison_last_granule(const void *address, size_t size) +void kasan_poison_last_granule(const void *addr, size_t size) { if (size & KASAN_GRANULE_MASK) { - u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size); *shadow = size & KASAN_GRANULE_MASK; } } #endif -void kasan_unpoison(const void *address, size_t size) +void kasan_unpoison(const void *addr, size_t size) { - u8 tag = get_tag(address); + u8 tag = get_tag(addr); /* * Perform shadow offset calculation based on untagged address, as * some of the callers (e.g. kasan_unpoison_object_data) pass tagged * addresses to this function. */ - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); /* * Skip KFENCE memory if called explicitly outside of sl*b. Also note * that calls to ksize(), where size is not a multiple of machine-word * size, would otherwise poison the invalid portion of the word. */ - if (is_kfence_address(address)) + if (is_kfence_address(addr)) return; - /* Unpoison round_up(size, KASAN_GRANULE_SIZE) bytes. */ - kasan_poison(address, size, tag); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + + /* Unpoison all granules that cover the object. */ + kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag); /* Partially poison the last granule for the generic mode. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) - kasan_poison_last_granule(address, size); + kasan_poison_last_granule(addr, size); } #ifdef CONFIG_MEMORY_HOTPLUG From 2cb34276427a093e2d7cc6ea63ac447bad1ff4c1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:31 -0800 Subject: [PATCH 467/506] arm64: kasan: simplify and inline MTE functions This change provides a simpler implementation of mte_get_mem_tag(), mte_get_random_tag(), and mte_set_mem_tag_range(). Simplifications include removing system_supports_mte() checks as these functions are onlye called from KASAN runtime that had already checked system_supports_mte(). Besides that, size and address alignment checks are removed from mte_set_mem_tag_range(), as KASAN now does those. This change also moves these functions into the asm/mte-kasan.h header and implements mte_set_mem_tag_range() via inline assembly to avoid unnecessary functions calls. [vincenzo.frascino@arm.com: fix warning in mte_get_random_tag()] Link: https://lkml.kernel.org/r/20210211152208.23811-1-vincenzo.frascino@arm.com Link: https://lkml.kernel.org/r/a26121b294fdf76e369cb7a74351d1c03a908930.1612546384.git.andreyknvl@google.com Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Andrey Konovalov Reviewed-by: Catalin Marinas Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/cache.h | 1 - arch/arm64/include/asm/kasan.h | 1 + arch/arm64/include/asm/mte-def.h | 2 + arch/arm64/include/asm/mte-kasan.h | 67 ++++++++++++++++++++++++++---- arch/arm64/include/asm/mte.h | 2 - arch/arm64/kernel/mte.c | 46 -------------------- arch/arm64/lib/mte.S | 16 ------- 7 files changed, 61 insertions(+), 74 deletions(-) diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 77cbbe3625f2..a074459f8f2f 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -6,7 +6,6 @@ #define __ASM_CACHE_H #include -#include #define CTR_L1IP_SHIFT 14 #define CTR_L1IP_MASK 3 diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index 0aaf9044cd6a..12d5f47f7dbe 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -6,6 +6,7 @@ #include #include +#include #include #define arch_kasan_set_tag(addr, tag) __tag_set(addr, tag) diff --git a/arch/arm64/include/asm/mte-def.h b/arch/arm64/include/asm/mte-def.h index 2d73a1612f09..cf241b0f0a42 100644 --- a/arch/arm64/include/asm/mte-def.h +++ b/arch/arm64/include/asm/mte-def.h @@ -11,4 +11,6 @@ #define MTE_TAG_SIZE 4 #define MTE_TAG_MASK GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), MTE_TAG_SHIFT) +#define __MTE_PREAMBLE ARM64_ASM_PREAMBLE ".arch_extension memtag\n" + #endif /* __ASM_MTE_DEF_H */ diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 3748d5bb88c0..7ab500e2ad17 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -11,12 +11,15 @@ #include -/* - * The functions below are meant to be used only for the - * KASAN_HW_TAGS interface defined in asm/memory.h. - */ #ifdef CONFIG_ARM64_MTE +/* + * These functions are meant to be only used from KASAN runtime through + * the arch_*() interface defined in asm/memory.h. + * These functions don't include system_supports_mte() checks, + * as KASAN only calls them when MTE is supported and enabled. + */ + static inline u8 mte_get_ptr_tag(void *ptr) { /* Note: The format of KASAN tags is 0xF */ @@ -25,9 +28,54 @@ static inline u8 mte_get_ptr_tag(void *ptr) return tag; } -u8 mte_get_mem_tag(void *addr); -u8 mte_get_random_tag(void); -void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag); +/* Get allocation tag for the address. */ +static inline u8 mte_get_mem_tag(void *addr) +{ + asm(__MTE_PREAMBLE "ldg %0, [%0]" + : "+r" (addr)); + + return mte_get_ptr_tag(addr); +} + +/* Generate a random tag. */ +static inline u8 mte_get_random_tag(void) +{ + void *addr; + + asm(__MTE_PREAMBLE "irg %0, %0" + : "=r" (addr)); + + return mte_get_ptr_tag(addr); +} + +/* + * Assign allocation tags for a region of memory based on the pointer tag. + * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and + * size must be non-zero and MTE_GRANULE_SIZE aligned. + */ +static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag) +{ + u64 curr, end; + + if (!size) + return; + + curr = (u64)__tag_set(addr, tag); + end = curr + size; + + do { + /* + * 'asm volatile' is required to prevent the compiler to move + * the statement outside of the loop. + */ + asm volatile(__MTE_PREAMBLE "stg %0, [%0]" + : + : "r" (curr) + : "memory"); + + curr += MTE_GRANULE_SIZE; + } while (curr != end); +} void mte_enable_kernel(void); void mte_init_tags(u64 max_tag); @@ -46,13 +94,14 @@ static inline u8 mte_get_mem_tag(void *addr) { return 0xFF; } + static inline u8 mte_get_random_tag(void) { return 0xFF; } -static inline void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag) + +static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag) { - return addr; } static inline void mte_enable_kernel(void) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index d02aff9f493d..9b557a457f24 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -8,8 +8,6 @@ #include #include -#define __MTE_PREAMBLE ARM64_ASM_PREAMBLE ".arch_extension memtag\n" - #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 2cfc850809ce..b3c70a612c7a 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -88,51 +87,6 @@ int memcmp_pages(struct page *page1, struct page *page2) return ret; } -u8 mte_get_mem_tag(void *addr) -{ - if (!system_supports_mte()) - return 0xFF; - - asm(__MTE_PREAMBLE "ldg %0, [%0]" - : "+r" (addr)); - - return mte_get_ptr_tag(addr); -} - -u8 mte_get_random_tag(void) -{ - void *addr; - - if (!system_supports_mte()) - return 0xFF; - - asm(__MTE_PREAMBLE "irg %0, %0" - : "+r" (addr)); - - return mte_get_ptr_tag(addr); -} - -void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag) -{ - void *ptr = addr; - - if ((!system_supports_mte()) || (size == 0)) - return addr; - - /* Make sure that size is MTE granule aligned. */ - WARN_ON(size & (MTE_GRANULE_SIZE - 1)); - - /* Make sure that the address is MTE granule aligned. */ - WARN_ON((u64)addr & (MTE_GRANULE_SIZE - 1)); - - tag = 0xF0 | tag; - ptr = (void *)__tag_set(ptr, tag); - - mte_assign_mem_tag_range(ptr, size); - - return ptr; -} - void mte_init_tags(u64 max_tag) { static bool gcr_kernel_excl_initialized; diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 9e1a12e10053..351537c12f36 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -149,19 +149,3 @@ SYM_FUNC_START(mte_restore_page_tags) ret SYM_FUNC_END(mte_restore_page_tags) - -/* - * Assign allocation tags for a region of memory based on the pointer tag - * x0 - source pointer - * x1 - size - * - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and - * size must be non-zero and MTE_GRANULE_SIZE aligned. - */ -SYM_FUNC_START(mte_assign_mem_tag_range) -1: stg x0, [x0] - add x0, x0, #MTE_GRANULE_SIZE - subs x1, x1, #MTE_GRANULE_SIZE - b.gt 1b - ret -SYM_FUNC_END(mte_assign_mem_tag_range) From c80a03664e154b7263af1c4dd53f42221d0c8283 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:35 -0800 Subject: [PATCH 468/506] kasan: inline HW_TAGS helper functions Mark all static functions in common.c and kasan.h that are used for hardware tag-based KASAN as inline to avoid unnecessary function calls. Link: https://lkml.kernel.org/r/2c94a2af0657f2b95b9337232339ff5ffa643ab5.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index dec7375fb884..b5e08d4cefec 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -279,7 +279,8 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object) * based on objects indexes, so that objects that are next to each other * get different tags. */ -static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init) +static inline u8 assign_tag(struct kmem_cache *cache, + const void *object, bool init) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return 0xff; @@ -321,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, - unsigned long ip, bool quarantine) +static inline bool ____kasan_slab_free(struct kmem_cache *cache, + void *object, unsigned long ip, bool quarantine) { u8 tag; void *tagged_object; @@ -366,7 +367,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return ____kasan_slab_free(cache, object, ip, true); } -static bool ____kasan_kfree_large(void *ptr, unsigned long ip) +static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip); @@ -461,8 +462,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, return tagged_object; } -static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags) +static inline void *____kasan_kmalloc(struct kmem_cache *cache, + const void *object, size_t size, gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; From 7169487bc2a7c5732a6eeebc6dc3d1351d4a6350 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 25 Feb 2021 17:20:38 -0800 Subject: [PATCH 469/506] kasan: clarify that only first bug is reported in HW_TAGS Hwardware tag-based KASAN only reports the first found bug. After that MTE tag checking gets disabled. Clarify this in comments and documentation. Link: https://lkml.kernel.org/r/00383ba88a47c3f8342d12263c24bdf95527b07d.1612546384.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 8 ++++++-- mm/kasan/hw_tags.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index cde14aeefca7..ddf4239a5890 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -155,7 +155,7 @@ Boot parameters ~~~~~~~~~~~~~~~ Hardware tag-based KASAN mode (see the section about various modes below) is -intended for use in production as a security mitigation. Therefore it supports +intended for use in production as a security mitigation. Therefore, it supports boot parameters that allow to disable KASAN competely or otherwise control particular KASAN features. @@ -165,7 +165,8 @@ particular KASAN features. traces collection (default: ``on``). - ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN - report or also panic the kernel (default: ``report``). + report or also panic the kernel (default: ``report``). Note, that tag + checking gets disabled after the first reported bug. For developers ~~~~~~~~~~~~~~ @@ -295,6 +296,9 @@ Note, that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being enabled. Even when kasan.mode=off is provided, or when the hardware doesn't support MTE (but supports TBI). +Hardware tag-based KASAN only reports the first found bug. After that MTE tag +checking gets disabled. + What memory accesses are sanitised by KASAN? -------------------------------------------- diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index b31aeef505dd..2aad21fda156 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -48,7 +48,7 @@ EXPORT_SYMBOL(kasan_flag_enabled); /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); -/* Whether panic or disable tag checking on fault. */ +/* Whether to panic or print a report and disable tag checking on fault. */ bool kasan_flag_panic __ro_after_init; /* kasan=off/on */ From 2956f4e4f0c504697f9dd6b84fd5c57ede35d333 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:20:42 -0800 Subject: [PATCH 470/506] alpha: remove CONFIG_EXPERIMENTAL from defconfigs Since CONFIG_EXPERIMENTAL was removed in 2013, go ahead and drop it from any defconfig files. Link: https://lkml.kernel.org/r/20210115005956.29408-1-rdunlap@infradead.org Fixes: 3d374d09f16f ("final removal of CONFIG_EXPERIMENTAL") Signed-off-by: Randy Dunlap Cc: Kees Cook Cc: Greg Kroah-Hartman Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/configs/defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig index 6293675db164..724c4075df40 100644 --- a/arch/alpha/configs/defconfig +++ b/arch/alpha/configs/defconfig @@ -1,4 +1,3 @@ -CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 From 152c432b128cb043fc107e8f211195fe94b2159c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 25 Feb 2021 17:20:45 -0800 Subject: [PATCH 471/506] proc/wchan: use printk format instead of lookup_symbol_name() To resolve the symbol fuction name for wchan, use the printk format specifier %ps instead of manually looking up the symbol function name via lookup_symbol_name(). Link: https://lkml.kernel.org/r/20201217165413.GA1959@ls3530.fritz.box Signed-off-by: Helge Deller Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 56bf14316122..3851bfcdba56 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -67,7 +67,6 @@ #include #include #include -#include #include #include #include @@ -386,19 +385,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; - char symname[KSYM_NAME_LEN]; - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto print0; + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + wchan = get_wchan(task); + else + wchan = 0; - wchan = get_wchan(task); - if (wchan && !lookup_symbol_name(wchan, symname)) { - seq_puts(m, symname); - return 0; - } + if (wchan) + seq_printf(m, "%ps", (void *) wchan); + else + seq_putc(m, '0'); -print0: - seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ From 4508943794efdd94171549c0bd52810e2f4ad9fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 25 Feb 2021 17:20:49 -0800 Subject: [PATCH 472/506] proc: use kvzalloc for our kernel buffer Since sysctl: pass kernel pointers to ->proc_handler we have been pre-allocating a buffer to copy the data from the proc handlers into, and then copying that to userspace. The problem is this just blindly kzalloc()'s the buffer size passed in from the read, which in the case of our 'cat' binary was 64kib. Order-4 allocations are not awesome, and since we can potentially allocate up to our maximum order, so use kvzalloc for these buffers. [willy@infradead.org: changelog tweaks] Link: https://lkml.kernel.org/r/6345270a2c1160b89dd5e6715461f388176899d1.1612972413.git.josef@toxicpanda.com Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Josef Bacik Reviewed-by: Christoph Hellwig Acked-by: Vlastimil Babka Cc: Al Viro Cc: Alexey Dobriyan CC: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 656ba24c317d..984e42f8cb11 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -571,7 +571,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; - kbuf = kzalloc(count + 1, GFP_KERNEL); + kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; @@ -600,7 +600,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = count; out_free_buf: - kfree(kbuf); + kvfree(kbuf); out: sysctl_head_finish(head); From 3b3376f222e3ab58367d9dd405cafd09d5e37b7c Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Thu, 25 Feb 2021 17:20:53 -0800 Subject: [PATCH 473/506] sysctl.c: fix underflow value setting risk in vm_table Apart from subsystem specific .proc_handler handler, all ctl_tables with extra1 and extra2 members set should use proc_dointvec_minmax instead of proc_dointvec, or the limit set in extra* never work and potentially echo underflow values(negative numbers) is likely make system unstable. Especially vfs_cache_pressure and zone_reclaim_mode, -1 is apparently not a valid value, but we can set to them. And then kernel may crash. # echo -1 > /proc/sys/vm/vfs_cache_pressure Link: https://lkml.kernel.org/r/20201223105535.2875-1-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9fbdd848138..62fbd09b5dc1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2962,7 +2962,7 @@ static struct ctl_table vm_table[] = { .data = &block_dump, .maxlen = sizeof(block_dump), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { @@ -2970,7 +2970,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ @@ -2980,7 +2980,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif @@ -2990,7 +2990,7 @@ static struct ctl_table vm_table[] = { .data = &node_reclaim_mode, .maxlen = sizeof(node_reclaim_mode), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { From df54714f579a77662054132161612ce3da876b0d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:20:56 -0800 Subject: [PATCH 474/506] include/linux: remove repeated words Drop the doubled word "for" in a comment. {firewire-cdev.h} Drop the doubled word "in" in a comment. {input.h} Drop the doubled word "a" in a comment. {mdev.h} Drop the doubled word "the" in a comment. {ptrace.h} Link: https://lkml.kernel.org/r/20210126232444.22861-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Stefan Richter Cc: Dmitry Torokhov Cc: Kirti Wankhede Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mdev.h | 2 +- include/linux/ptrace.h | 2 +- include/uapi/linux/firewire-cdev.h | 2 +- include/uapi/linux/input.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 9004375c462e..27eb383cb95d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -42,7 +42,7 @@ struct device *mdev_get_iommu_device(struct device *dev); * @mdev: mdev_device structure on of mediated device * that is being created * Returns integer: success (0) or error (< 0) - * @remove: Called to free resources in parent device's driver for a + * @remove: Called to free resources in parent device's driver for * a mediated device. It is mandatory to provide 'remove' * ops. * @mdev: mdev_device device structure which is being diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 2a9df80ea887..b5ebf6c01292 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -171,7 +171,7 @@ static inline void ptrace_event(int event, unsigned long message) * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the - * the ptrace parent's pid namespace. + * ptrace parent's pid namespace. * * Called without locks. */ diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 7e5b5c10a49c..5effa9832802 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -844,7 +844,7 @@ struct fw_cdev_queue_iso { * struct fw_cdev_start_iso - Start an isochronous transmission or reception * @cycle: Cycle in which to start I/O. If @cycle is greater than or * equal to 0, the I/O will start on that cycle. - * @sync: Determines the value to wait for for receive packets that have + * @sync: Determines the value to wait for receive packets that have * the %FW_CDEV_ISO_SYNC bit set * @tags: Tag filter bit mask. Only valid for isochronous reception. * Determines the tag values for which packets will be accepted. diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 9a61c28ed3ae..ee3127461ee0 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -84,7 +84,7 @@ struct input_id { * in units per radian. * When INPUT_PROP_ACCELEROMETER is set the resolution changes. * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in - * in units per g (units/g) and in units per degree per second + * units per g (units/g) and in units per degree per second * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ). */ struct input_absinfo { From c131bd0b5448bb577b7a9ed48c4e528807e8d5af Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 25 Feb 2021 17:21:00 -0800 Subject: [PATCH 475/506] treewide: Miguel has moved Update contact info. Link: https://lkml.kernel.org/r/20210206162524.GA11520@kernel.org Signed-off-by: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 1 + CREDITS | 9 +++------ Documentation/admin-guide/auxdisplay/cfag12864b.rst | 2 +- Documentation/admin-guide/auxdisplay/ks0108.rst | 2 +- MAINTAINERS | 12 ++++++------ drivers/auxdisplay/cfag12864b.c | 4 ++-- drivers/auxdisplay/cfag12864bfb.c | 4 ++-- drivers/auxdisplay/ks0108.c | 4 ++-- include/linux/cfag12864b.h | 2 +- include/linux/ks0108.h | 2 +- samples/auxdisplay/cfag12864b-example.c | 2 +- 11 files changed, 21 insertions(+), 23 deletions(-) diff --git a/.mailmap b/.mailmap index 87a8bbdbf749..85b93cdefc87 100644 --- a/.mailmap +++ b/.mailmap @@ -237,6 +237,7 @@ Maxime Ripard Mayuresh Janorkar Michael Buesch Michel Dänzer +Miguel Ojeda Mike Rapoport Mike Rapoport Mike Rapoport diff --git a/CREDITS b/CREDITS index be097156bd71..cef83b958cbe 100644 --- a/CREDITS +++ b/CREDITS @@ -2841,14 +2841,11 @@ S: Subiaco, 6008 S: Perth, Western Australia S: Australia -N: Miguel Ojeda Sandonis -E: miguel.ojeda.sandonis@gmail.com -W: http://miguelojeda.es -W: http://jair.lab.fi.uva.es/~migojed/ +N: Miguel Ojeda +E: ojeda@kernel.org +W: https://ojeda.dev D: Author of the ks0108, cfag12864b and cfag12864bfb auxiliary display drivers. D: Maintainer of the auxiliary display drivers tree (drivers/auxdisplay/*) -S: C/ Mieses 20, 9-B -S: Valladolid 47009 S: Spain N: Peter Oruba diff --git a/Documentation/admin-guide/auxdisplay/cfag12864b.rst b/Documentation/admin-guide/auxdisplay/cfag12864b.rst index 18c2865bd322..da385d851acc 100644 --- a/Documentation/admin-guide/auxdisplay/cfag12864b.rst +++ b/Documentation/admin-guide/auxdisplay/cfag12864b.rst @@ -3,7 +3,7 @@ cfag12864b LCD Driver Documentation =================================== :License: GPLv2 -:Author & Maintainer: Miguel Ojeda Sandonis +:Author & Maintainer: Miguel Ojeda :Date: 2006-10-27 diff --git a/Documentation/admin-guide/auxdisplay/ks0108.rst b/Documentation/admin-guide/auxdisplay/ks0108.rst index c0b7faf73136..a7d3fe509373 100644 --- a/Documentation/admin-guide/auxdisplay/ks0108.rst +++ b/Documentation/admin-guide/auxdisplay/ks0108.rst @@ -3,7 +3,7 @@ ks0108 LCD Controller Driver Documentation ========================================== :License: GPLv2 -:Author & Maintainer: Miguel Ojeda Sandonis +:Author & Maintainer: Miguel Ojeda :Date: 2006-10-27 diff --git a/MAINTAINERS b/MAINTAINERS index 40040db747fc..e42082eccf36 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2982,7 +2982,7 @@ F: include/uapi/linux/audit.h F: kernel/audit* AUXILIARY DISPLAY DRIVERS -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: drivers/auxdisplay/ F: include/linux/cfag12864b.h @@ -4128,13 +4128,13 @@ F: scripts/extract-cert.c F: scripts/sign-file.c CFAG12864B LCD DRIVER -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: drivers/auxdisplay/cfag12864b.c F: include/linux/cfag12864b.h CFAG12864BFB LCD FRAMEBUFFER DRIVER -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: drivers/auxdisplay/cfag12864bfb.c F: include/linux/cfag12864b.h @@ -4304,7 +4304,7 @@ S: Supported F: drivers/infiniband/hw/usnic/ CLANG-FORMAT FILE -M: Miguel Ojeda +M: Miguel Ojeda S: Maintained F: .clang-format @@ -4444,7 +4444,7 @@ S: Maintained F: drivers/platform/x86/compal-laptop.c COMPILER ATTRIBUTES -M: Miguel Ojeda +M: Miguel Ojeda S: Maintained F: include/linux/compiler_attributes.h @@ -9939,7 +9939,7 @@ F: include/linux/kprobes.h F: kernel/kprobes.c KS0108 LCD CONTROLLER DRIVER -M: Miguel Ojeda Sandonis +M: Miguel Ojeda S: Maintained F: Documentation/admin-guide/auxdisplay/ks0108.rst F: drivers/auxdisplay/ks0108.c diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c index 7eebae7e322c..fd430e6866a1 100644 --- a/drivers/auxdisplay/cfag12864b.c +++ b/drivers/auxdisplay/cfag12864b.c @@ -5,7 +5,7 @@ * Description: cfag12864b LCD driver * Depends: ks0108 * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ @@ -376,5 +376,5 @@ module_init(cfag12864b_init); module_exit(cfag12864b_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda "); MODULE_DESCRIPTION("cfag12864b LCD driver"); diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c index 2002291ab338..d66821adf453 100644 --- a/drivers/auxdisplay/cfag12864bfb.c +++ b/drivers/auxdisplay/cfag12864bfb.c @@ -5,7 +5,7 @@ * Description: cfag12864b LCD framebuffer driver * Depends: cfag12864b * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ @@ -171,5 +171,5 @@ module_init(cfag12864bfb_init); module_exit(cfag12864bfb_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda "); MODULE_DESCRIPTION("cfag12864b LCD framebuffer driver"); diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c index abfe3fa9e6f4..03c95ad4216c 100644 --- a/drivers/auxdisplay/ks0108.c +++ b/drivers/auxdisplay/ks0108.c @@ -5,7 +5,7 @@ * Description: ks0108 LCD Controller driver * Depends: parport * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ @@ -182,6 +182,6 @@ module_init(ks0108_init); module_exit(ks0108_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda "); MODULE_DESCRIPTION("ks0108 LCD Controller driver"); diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h index 4060004968c8..6617d9c68d86 100644 --- a/include/linux/cfag12864b.h +++ b/include/linux/cfag12864b.h @@ -4,7 +4,7 @@ * Version: 0.1.0 * Description: cfag12864b LCD driver header * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-12 */ diff --git a/include/linux/ks0108.h b/include/linux/ks0108.h index 0738389b42b6..1a37a664f915 100644 --- a/include/linux/ks0108.h +++ b/include/linux/ks0108.h @@ -4,7 +4,7 @@ * Version: 0.1.0 * Description: ks0108 LCD Controller driver header * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c index bfeab44f81d0..2e3bb7375c99 100644 --- a/samples/auxdisplay/cfag12864b-example.c +++ b/samples/auxdisplay/cfag12864b-example.c @@ -4,7 +4,7 @@ * Version: 0.1.0 * Description: cfag12864b LCD userspace example program * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda * Date: 2006-10-31 */ From c1f26493ed7f363c63e0e9d91e50d4db26df6603 Mon Sep 17 00:00:00 2001 From: Hubert Jasudowicz Date: Thu, 25 Feb 2021 17:21:03 -0800 Subject: [PATCH 476/506] groups: use flexible-array member in struct group_info Replace zero-size array with flexible array member, as recommended by the docs. Link: https://lkml.kernel.org/r/155995eed35c3c1bdcc56e69d8997c8e4c46740a.1611620846.git.hubert.jasudowicz@gmail.com Signed-off-by: Hubert Jasudowicz Cc: "Peter Zijlstra (Intel)" Cc: Micah Morton Cc: Gao Xiang Cc: Michael Kelley Cc: Thomas Cedeno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cred.h b/include/linux/cred.h index 18639c069263..4c6350503697 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -25,7 +25,7 @@ struct inode; struct group_info { atomic_t usage; int ngroups; - kgid_t gid[0]; + kgid_t gid[]; } __randomize_layout; /** From e1e014115dfd48ab3e3691ce46f9484ce12e67d4 Mon Sep 17 00:00:00 2001 From: Hubert Jasudowicz Date: Thu, 25 Feb 2021 17:21:07 -0800 Subject: [PATCH 477/506] groups: simplify struct group_info allocation Combine kmalloc and vmalloc into a single call. Use struct_size macro instead of direct size calculation. Link: https://lkml.kernel.org/r/ba9ba5beea9a44b7196c41a0d9528abd5f20dd2e.1611620846.git.hubert.jasudowicz@gmail.com Signed-off-by: Hubert Jasudowicz Cc: Gao Xiang Cc: Micah Morton Cc: Michael Kelley Cc: "Peter Zijlstra (Intel)" Cc: Thomas Cedeno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/groups.c b/kernel/groups.c index fe7e6385530e..787b381c7c00 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -15,12 +15,7 @@ struct group_info *groups_alloc(int gidsetsize) { struct group_info *gi; - unsigned int len; - - len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; - gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); - if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); + gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT); if (!gi) return NULL; From c034f48e99907d5be147ac8f0f3e630a9307c2be Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:21:10 -0800 Subject: [PATCH 478/506] kernel: delete repeated words in comments Drop repeated words in kernel/events/. {if, the, that, with, time} Drop repeated words in kernel/locking/. {it, no, the} Drop repeated words in kernel/sched/. {in, not} Link: https://lkml.kernel.org/r/20210127023412.26292-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Will Deacon [kernel/locking/] Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Will Deacon Cc: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: Juri Lelli Cc: Vincent Guittot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 8 ++++---- kernel/events/uprobes.c | 2 +- kernel/locking/rtmutex.c | 4 ++-- kernel/locking/rwsem.c | 2 +- kernel/locking/semaphore.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/membarrier.c | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 129dee540a8b..0aeca5f3c0ac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -269,7 +269,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to - * stabilize the the event->ctx relation. See + * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); @@ -1303,7 +1303,7 @@ static void put_ctx(struct perf_event_context *ctx) * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * - * But remember that that these are parent<->child context relations, and + * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * @@ -1442,7 +1442,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. * - * This has to cope with with the fact that until it is locked, + * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * @@ -2486,7 +2486,7 @@ static void perf_set_shadow_time(struct perf_event *event, * But this is a bit hairy. * * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it + * within the time source all along. We believe it * is cleaner and simpler to understand. */ if (is_cgroup_event(event)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3ea7f8f92f1d..6addc9780319 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1733,7 +1733,7 @@ void uprobe_free_utask(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task if if necessary. + * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 03b21135313c..48fff6437901 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1420,7 +1420,7 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, } /* - * Performs the wakeup of the the top-waiter and re-enables preemption. + * Performs the wakeup of the top-waiter and re-enables preemption. */ void rt_mutex_postunlock(struct wake_q_head *wake_q) { @@ -1819,7 +1819,7 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) * been started. * @waiter: the pre-initialized rt_mutex_waiter * - * Wait for the the lock acquisition started on our behalf by + * Wait for the lock acquisition started on our behalf by * rt_mutex_start_proxy_lock(). Upon failure, the caller must call * rt_mutex_cleanup_proxy_lock(). * diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ba67600c7b2c..abba5df50006 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1048,7 +1048,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* * If there were already threads queued before us and: - * 1) there are no no active locks, wake the front + * 1) there are no active locks, wake the front * queued process(es) as the handoff bit might be set. * 2) there are no active writers and some readers, the lock * must be read owned; so we try to wake any read lock diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index d9dd94defc0a..9aa855a96c4a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -119,7 +119,7 @@ EXPORT_SYMBOL(down_killable); * @sem: the semaphore to be acquired * * Try to acquire the semaphore atomically. Returns 0 if the semaphore has - * been acquired successfully or 1 if it it cannot be acquired. + * been acquired successfully or 1 if it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and * mutex_trylock! Be careful about this when converting code. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a8bd7b13634..794c2cb945f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5126,7 +5126,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. + * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 08ae45ad9261..acdae625c636 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -454,7 +454,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) /* * For each cpu runqueue, if the task's mm match @mm, ensure that all - * @mm's membarrier state set bits are also set in in the runqueue's + * @mm's membarrier state set bits are also set in the runqueue's * membarrier state. This ensures that a runqueue scheduling * between threads which are users of @mm has its membarrier state * updated. From 7b4693e644cbdafdb2a2393fee8f81d85edd1b7d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 25 Feb 2021 17:21:14 -0800 Subject: [PATCH 479/506] MAINTAINERS: add uapi directories to API/ABI section Let's add include/uapi/ and arch/*/include/uapi/ to API/ABI section, so that for patches modifying them, get_maintainers.pl suggests CCing linux-api@ so people don't forget. Link: https://lkml.kernel.org/r/20210217174745.13591-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: David Hildenbrand Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e42082eccf36..498cc779e354 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -261,6 +261,8 @@ ABI/API L: linux-api@vger.kernel.org F: include/linux/syscalls.h F: kernel/sys_ni.c +F: include/uapi/ +F: arch/*/include/uapi/ ABIT UGURU 1,2 HARDWARE MONITOR DRIVER M: Hans de Goede From 0e24465d3313832e82f8bd9ee2439da1367dd2e5 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Thu, 25 Feb 2021 17:21:17 -0800 Subject: [PATCH 480/506] lib/genalloc.c: change return type to unsigned long for bitmap_set_ll Just as bitmap_clear_ll(), change return type to unsigned long for bitmap_set_ll to avoid the possible overflow in future. Link: https://lkml.kernel.org/r/20210105031644.2771-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index dab97bb69df6..5dcf9cdcbc46 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -81,7 +81,8 @@ static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) * users set the same bit, one user will return remain bits, otherwise * return 0. */ -static int bitmap_set_ll(unsigned long *map, unsigned long start, unsigned long nr) +static unsigned long +bitmap_set_ll(unsigned long *map, unsigned long start, unsigned long nr) { unsigned long *p = map + BIT_WORD(start); const unsigned long size = start + nr; From a28a6e860c6cf231cf3c5171c75c342adcd00406 Mon Sep 17 00:00:00 2001 From: Francis Laniel Date: Thu, 25 Feb 2021 17:21:20 -0800 Subject: [PATCH 481/506] string.h: move fortified functions definitions in a dedicated header. This patch adds fortify-string.h to contain fortified functions definitions. Thus, the code is more separated and compile time is approximately 1% faster for people who do not set CONFIG_FORTIFY_SOURCE. Link: https://lkml.kernel.org/r/20210111092141.22946-1-laniel_francis@privacyrequired.com Link: https://lkml.kernel.org/r/20210111092141.22946-2-laniel_francis@privacyrequired.com Signed-off-by: Francis Laniel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fortify-string.h | 302 +++++++++++++++++++++++++++++++++ include/linux/string.h | 282 +----------------------------- 2 files changed, 303 insertions(+), 281 deletions(-) create mode 100644 include/linux/fortify-string.h diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h new file mode 100644 index 000000000000..c1be37437e77 --- /dev/null +++ b/include/linux/fortify-string.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FORTIFY_STRING_H_ +#define _LINUX_FORTIFY_STRING_H_ + + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + +__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 1); + + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __underlying_strncpy(p, q, size); +} + +__FORTIFY_INLINE char *strcat(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 1); + + if (p_size == (size_t)-1) + return __underlying_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) + fortify_panic(__func__); + return p; +} + +__FORTIFY_INLINE __kernel_size_t strlen(const char *p) +{ + __kernel_size_t ret; + size_t p_size = __builtin_object_size(p, 1); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || + (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) + return __underlying_strlen(p); + ret = strnlen(p, p_size); + if (p_size <= ret) + fortify_panic(__func__); + return ret; +} + +extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); +__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) +{ + size_t p_size = __builtin_object_size(p, 1); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); + return ret; +} + +/* defined after fortified strlen to reuse it */ +extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); +__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) +{ + size_t ret; + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + + if (__builtin_constant_p(len) && len >= p_size) + __write_overflow(); + if (len >= p_size) + fortify_panic(__func__); + __underlying_memcpy(p, q, len); + p[len] = '\0'; + } + return ret; +} + +/* defined after fortified strnlen to reuse it */ +extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); +__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size) +{ + size_t len; + /* Use string size rather than possible enclosing struct size. */ + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + + /* If we cannot get size of p and q default to call strscpy. */ + if (p_size == (size_t) -1 && q_size == (size_t) -1) + return __real_strscpy(p, q, size); + + /* + * If size can be known at compile time and is greater than + * p_size, generate a compile time write overflow error. + */ + if (__builtin_constant_p(size) && size > p_size) + __write_overflow(); + + /* + * This call protects from read overflow, because len will default to q + * length if it smaller than size. + */ + len = strnlen(q, size); + /* + * If len equals size, we will copy only size bytes which leads to + * -E2BIG being returned. + * Otherwise we will copy len + 1 because of the final '\O'. + */ + len = len == size ? size : len + 1; + + /* + * Generate a runtime write overflow error if len is greater than + * p_size. + */ + if (len > p_size) + fortify_panic(__func__); + + /* + * We can now safely call vanilla strscpy because we are protected from: + * 1. Read overflow thanks to call to strnlen(). + * 2. Write overflow thanks to above ifs. + */ + return __real_strscpy(p, q, len); +} + +/* defined after fortified strlen and strnlen to reuse them */ +__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) +{ + size_t p_len, copy_len; + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __underlying_strncat(p, q, count); + p_len = strlen(p); + copy_len = strnlen(q, count); + if (p_size < p_len + copy_len + 1) + fortify_panic(__func__); + __underlying_memcpy(p + p_len, q, copy_len); + p[p_len + copy_len] = '\0'; + return p; +} + +__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __underlying_memset(p, c, size); +} + +__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __underlying_memcpy(p, q, size); +} + +__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __underlying_memmove(p, q, size); +} + +extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); +__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memscan(p, c, size); +} + +__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + + if (__builtin_constant_p(size)) { + if (p_size < size) + __read_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __underlying_memcmp(p, q, size); +} + +__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __underlying_memchr(p, c, size); +} + +void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); +__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memchr_inv(p, c, size); +} + +extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); +__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) +{ + size_t p_size = __builtin_object_size(p, 0); + + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_kmemdup(p, size, gfp); +} + +/* defined after fortified strlen and memcpy to reuse them */ +__FORTIFY_INLINE char *strcpy(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 1); + size_t q_size = __builtin_object_size(q, 1); + size_t size; + + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __underlying_strcpy(p, q); + size = strlen(q) + 1; + /* test here to use the more stringent object size */ + if (p_size < size) + fortify_panic(__func__); + memcpy(p, q, size); + return p; +} + +/* Don't use these outside the FORITFY_SOURCE implementation */ +#undef __underlying_memchr +#undef __underlying_memcmp +#undef __underlying_memcpy +#undef __underlying_memmove +#undef __underlying_memset +#undef __underlying_strcat +#undef __underlying_strcpy +#undef __underlying_strlen +#undef __underlying_strncat +#undef __underlying_strncpy + +#endif /* _LINUX_FORTIFY_STRING_H_ */ diff --git a/include/linux/string.h b/include/linux/string.h index 4fcfb56abcf5..9521d8cab18e 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -266,287 +266,7 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); -extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); -extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); -extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); -extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); -extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); -extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); -extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); -extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); -extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); -#else -#define __underlying_memchr __builtin_memchr -#define __underlying_memcmp __builtin_memcmp -#define __underlying_memcpy __builtin_memcpy -#define __underlying_memmove __builtin_memmove -#define __underlying_memset __builtin_memset -#define __underlying_strcat __builtin_strcat -#define __underlying_strcpy __builtin_strcpy -#define __underlying_strlen __builtin_strlen -#define __underlying_strncat __builtin_strncat -#define __underlying_strncpy __builtin_strncpy -#endif - -__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 1); - if (__builtin_constant_p(size) && p_size < size) - __write_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __underlying_strncpy(p, q, size); -} - -__FORTIFY_INLINE char *strcat(char *p, const char *q) -{ - size_t p_size = __builtin_object_size(p, 1); - if (p_size == (size_t)-1) - return __underlying_strcat(p, q); - if (strlcat(p, q, p_size) >= p_size) - fortify_panic(__func__); - return p; -} - -__FORTIFY_INLINE __kernel_size_t strlen(const char *p) -{ - __kernel_size_t ret; - size_t p_size = __builtin_object_size(p, 1); - - /* Work around gcc excess stack consumption issue */ - if (p_size == (size_t)-1 || - (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) - return __underlying_strlen(p); - ret = strnlen(p, p_size); - if (p_size <= ret) - fortify_panic(__func__); - return ret; -} - -extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); -__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) -{ - size_t p_size = __builtin_object_size(p, 1); - __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); - if (p_size <= ret && maxlen != ret) - fortify_panic(__func__); - return ret; -} - -/* defined after fortified strlen to reuse it */ -extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); -__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) -{ - size_t ret; - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __real_strlcpy(p, q, size); - ret = strlen(q); - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - if (__builtin_constant_p(len) && len >= p_size) - __write_overflow(); - if (len >= p_size) - fortify_panic(__func__); - __underlying_memcpy(p, q, len); - p[len] = '\0'; - } - return ret; -} - -/* defined after fortified strnlen to reuse it */ -extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); -__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size) -{ - size_t len; - /* Use string size rather than possible enclosing struct size. */ - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - - /* If we cannot get size of p and q default to call strscpy. */ - if (p_size == (size_t) -1 && q_size == (size_t) -1) - return __real_strscpy(p, q, size); - - /* - * If size can be known at compile time and is greater than - * p_size, generate a compile time write overflow error. - */ - if (__builtin_constant_p(size) && size > p_size) - __write_overflow(); - - /* - * This call protects from read overflow, because len will default to q - * length if it smaller than size. - */ - len = strnlen(q, size); - /* - * If len equals size, we will copy only size bytes which leads to - * -E2BIG being returned. - * Otherwise we will copy len + 1 because of the final '\O'. - */ - len = len == size ? size : len + 1; - - /* - * Generate a runtime write overflow error if len is greater than - * p_size. - */ - if (len > p_size) - fortify_panic(__func__); - - /* - * We can now safely call vanilla strscpy because we are protected from: - * 1. Read overflow thanks to call to strnlen(). - * 2. Write overflow thanks to above ifs. - */ - return __real_strscpy(p, q, len); -} - -/* defined after fortified strlen and strnlen to reuse them */ -__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) -{ - size_t p_len, copy_len; - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __underlying_strncat(p, q, count); - p_len = strlen(p); - copy_len = strnlen(q, count); - if (p_size < p_len + copy_len + 1) - fortify_panic(__func__); - __underlying_memcpy(p + p_len, q, copy_len); - p[p_len + copy_len] = '\0'; - return p; -} - -__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __write_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __underlying_memset(p, c, size); -} - -__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); - if (__builtin_constant_p(size)) { - if (p_size < size) - __write_overflow(); - if (q_size < size) - __read_overflow2(); - } - if (p_size < size || q_size < size) - fortify_panic(__func__); - return __underlying_memcpy(p, q, size); -} - -__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); - if (__builtin_constant_p(size)) { - if (p_size < size) - __write_overflow(); - if (q_size < size) - __read_overflow2(); - } - if (p_size < size || q_size < size) - fortify_panic(__func__); - return __underlying_memmove(p, q, size); -} - -extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); -__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __real_memscan(p, c, size); -} - -__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); - if (__builtin_constant_p(size)) { - if (p_size < size) - __read_overflow(); - if (q_size < size) - __read_overflow2(); - } - if (p_size < size || q_size < size) - fortify_panic(__func__); - return __underlying_memcmp(p, q, size); -} - -__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __underlying_memchr(p, c, size); -} - -void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); -__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __real_memchr_inv(p, c, size); -} - -extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); -__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) -{ - size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) - __read_overflow(); - if (p_size < size) - fortify_panic(__func__); - return __real_kmemdup(p, size, gfp); -} - -/* defined after fortified strlen and memcpy to reuse them */ -__FORTIFY_INLINE char *strcpy(char *p, const char *q) -{ - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); - size_t size; - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __underlying_strcpy(p, q); - size = strlen(q) + 1; - /* test here to use the more stringent object size */ - if (p_size < size) - fortify_panic(__func__); - memcpy(p, q, size); - return p; -} - -/* Don't use these outside the FORITFY_SOURCE implementation */ -#undef __underlying_memchr -#undef __underlying_memcmp -#undef __underlying_memcpy -#undef __underlying_memmove -#undef __underlying_memset -#undef __underlying_strcat -#undef __underlying_strcpy -#undef __underlying_strlen -#undef __underlying_strncat -#undef __underlying_strncpy +#include #endif /** From d262093656a0eec6d6114a3178a9d887fddd0ded Mon Sep 17 00:00:00 2001 From: Yogesh Lal Date: Thu, 25 Feb 2021 17:21:24 -0800 Subject: [PATCH 482/506] lib: stackdepot: add support to configure STACK_HASH_SIZE Use CONFIG_STACK_HASH_ORDER to configure STACK_HASH_SIZE. Aim is to have configurable value for STACK_HASH_SIZE, so depend on use case one can configure it. One example is of Page Owner, CONFIG_PAGE_OWNER works only if page_owner=on via kernel parameter on CONFIG_PAGE_OWNER configured system. Thus, unless admin enable it via command line option, the stackdepot will just waste 8M memory without any customer. Making it configurable and use lower value helps to enable features like CONFIG_PAGE_OWNER without any significant overhead. Link: https://lkml.kernel.org/r/1611749198-24316-1-git-send-email-vjitta@codeaurora.org Signed-off-by: Yogesh Lal Signed-off-by: Vinayak Menon Signed-off-by: Vijayanand Jitta Reviewed-by: Minchan Kim Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 9 +++++++++ lib/stackdepot.c | 3 +-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 46806332a8cc..a38cc61256f1 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -651,6 +651,15 @@ config STACKDEPOT bool select STACKTRACE +config STACK_HASH_ORDER + int "stack depot hash size (12 => 4KB, 20 => 1024KB)" + range 12 20 + default 20 + depends on STACKDEPOT + help + Select the hash size as a power of 2 for the stackdepot hash table. + Choose a lower value to reduce the memory impact. + config SBITMAP bool diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 890dcc2e984e..4b9715470e87 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -141,8 +141,7 @@ static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, return stack; } -#define STACK_HASH_ORDER 20 -#define STACK_HASH_SIZE (1L << STACK_HASH_ORDER) +#define STACK_HASH_SIZE (1L << CONFIG_STACK_HASH_ORDER) #define STACK_HASH_MASK (STACK_HASH_SIZE - 1) #define STACK_HASH_SEED 0x9747b28c From e1fdc403349c64fa58f4c163f4bf9b860b4db808 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Thu, 25 Feb 2021 17:21:27 -0800 Subject: [PATCH 483/506] lib: stackdepot: add support to disable stack depot Add a kernel parameter stack_depot_disable to disable stack depot. So that stack hash table doesn't consume any memory when stack depot is disabled. The use case is CONFIG_PAGE_OWNER without page_owner=on. Without this patch, stackdepot will consume the memory for the hashtable. By default, it's 8M which is never trivial. With this option, in CONFIG_PAGE_OWNER configured system, page_owner=off, stack_depot_disable in kernel command line, we could save the wasted memory for the hashtable. [akpm@linux-foundation.org: fix CONFIG_STACKDEPOT=n build] Link: https://lkml.kernel.org/r/1611749198-24316-2-git-send-email-vjitta@codeaurora.org Signed-off-by: Vinayak Menon Signed-off-by: Vijayanand Jitta Cc: Alexander Potapenko Cc: Minchan Kim Cc: Yogesh Lal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 6 ++++ include/linux/stackdepot.h | 9 ++++++ init/main.c | 2 ++ lib/stackdepot.c | 32 ++++++++++++++++--- 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bab6a8b01202..04545725f187 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5182,6 +5182,12 @@ growing up) the main stack are reserved for no other mapping. Default value is 256 pages. + stack_depot_disable= [KNL] + Setting this to true through kernel command line will + disable the stack depot thereby saving the static memory + consumed by the stack hash table. By default this is set + to false. + stacktrace [FTRACE] Enabled the stack tracer on boot up. diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 24d49c732341..6bb4bc1a5f54 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -21,4 +21,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); +#ifdef CONFIG_STACKDEPOT +int stack_depot_init(void); +#else +static inline int stack_depot_init(void) +{ + return 0; +} +#endif /* CONFIG_STACKDEPOT */ + #endif diff --git a/init/main.c b/init/main.c index 261051070e3c..3648c9f94882 100644 --- a/init/main.c +++ b/init/main.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include @@ -827,6 +828,7 @@ static void __init mm_init(void) init_mem_debugging_and_hardening(); kfence_alloc_pool(); report_meminit(); + stack_depot_init(); mem_init(); /* page_owner must be initialized after buddy is ready */ page_ext_init_flatmem_late(); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4b9715470e87..cc21116512a7 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -31,6 +31,7 @@ #include #include #include +#include #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) @@ -145,9 +146,32 @@ static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, #define STACK_HASH_MASK (STACK_HASH_SIZE - 1) #define STACK_HASH_SEED 0x9747b28c -static struct stack_record *stack_table[STACK_HASH_SIZE] = { - [0 ... STACK_HASH_SIZE - 1] = NULL -}; +static bool stack_depot_disable; +static struct stack_record **stack_table; + +static int __init is_stack_depot_disabled(char *str) +{ + kstrtobool(str, &stack_depot_disable); + if (stack_depot_disable) { + pr_info("Stack Depot is disabled\n"); + stack_table = NULL; + } + return 0; +} +early_param("stack_depot_disable", is_stack_depot_disabled); + +int __init stack_depot_init(void) +{ + if (!stack_depot_disable) { + size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); + int i; + + stack_table = memblock_alloc(size, size); + for (i = 0; i < STACK_HASH_SIZE; i++) + stack_table[i] = NULL; + } + return 0; +} /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) @@ -241,7 +265,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned long flags; u32 hash; - if (unlikely(nr_entries == 0)) + if (unlikely(nr_entries == 0) || stack_depot_disable) goto fast_exit; hash = hash_stack(entries, nr_entries); From 64427985c76fcb54c783de617edf353009499a03 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Thu, 25 Feb 2021 17:21:31 -0800 Subject: [PATCH 484/506] lib: stackdepot: fix ignoring return value warning Fix the below ignoring return value warning for kstrtobool in is_stack_depot_disabled function. lib/stackdepot.c: In function 'is_stack_depot_disabled': lib/stackdepot.c:154:2: warning: ignoring return value of 'kstrtobool' declared with attribute 'warn_unused_result' [-Wunused-result] Link: https://lkml.kernel.org/r/1612163048-28026-1-git-send-email-vjitta@codeaurora.org Fixes: b9779abb09a8 ("lib: stackdepot: add support to disable stack depot") Signed-off-by: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/stackdepot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index cc21116512a7..49f67a0c6e5d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -151,8 +151,10 @@ static struct stack_record **stack_table; static int __init is_stack_depot_disabled(char *str) { - kstrtobool(str, &stack_depot_disable); - if (stack_depot_disable) { + int ret; + + ret = kstrtobool(str, &stack_depot_disable); + if (!ret && stack_depot_disable) { pr_info("Stack Depot is disabled\n"); stack_table = NULL; } From 96251a75e0097639a6df558e4e62f762100f03d3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Feb 2021 17:21:34 -0800 Subject: [PATCH 485/506] lib/cmdline: remove an unneeded local variable in next_arg() The local variable 'next' is unneeded because you can simply advance the existing pointer 'args'. Link: https://lkml.kernel.org/r/20210201014707.3828753-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cmdline.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index dfd4c4423f9a..5d474c626e24 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -228,7 +228,6 @@ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; - char *next; if (*args == '"') { args++; @@ -266,10 +265,10 @@ char *next_arg(char *args, char **param, char **val) if (args[i]) { args[i] = '\0'; - next = args + i + 1; + args += i + 1; } else - next = args + i; + args += i; /* Chew up trailing spaces. */ - return skip_spaces(next); + return skip_spaces(args); } From 4945cca232ce8bc699b8743f2436af664c471b96 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 25 Feb 2021 17:21:37 -0800 Subject: [PATCH 486/506] include/linux/bitops.h: spelling s/synomyn/synonym/ Fix a misspelling of "synonym". Link: https://lkml.kernel.org/r/20210108105305.2028120-1-geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a61f192c096b..a5a48303b0f1 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -214,7 +214,7 @@ static inline int get_count_order_long(unsigned long l) * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * - * On 64 bit arches this is a synomyn for __ffs + * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ From b5e8736a954aecd33adf276a2680dc24a36a2420 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:21:40 -0800 Subject: [PATCH 487/506] checkpatch: improve blank line after declaration test Avoid multiple false positives by ignoring attributes. Various attributes like volatile and ____cacheline_aligned_in_smp cause checkpatch to emit invalid "Missing a blank line after declarations" messages. Use copies of $sline and $prevline, remove $Attribute and $Sparse, and use the existing tests to avoid these false positives. Miscellanea: o Add volatile to $Attribute This also reduces checkpatch runtime a bit by moving the indentation comparison test to the start of the block to avoid multiple unnecessary regex tests. Link: https://lkml.kernel.org/r/9015fd00742bf4e5b824ad6d7fd7189530958548.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 52 ++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ca4201753d5e..bc8069503819 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -382,6 +382,7 @@ our $InitAttribute = qr{$InitAttributeData|$InitAttributeConst|$InitAttributeIni # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ const| + volatile| __percpu| __nocast| __safe| @@ -3776,43 +3777,48 @@ sub process { } # check for missing blank lines after declarations - if ($sline =~ /^\+\s+\S/ && #Not at char 1 - # actual declarations - ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || +# (declarations must have the same indentation and not be at the start of line) + if (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/) { + # use temporaries + my $sl = $sline; + my $pl = $prevline; + # remove $Attribute/$Sparse uses to simplify comparisons + $sl =~ s/\b(?:$Attribute|$Sparse)\b//g; + $pl =~ s/\b(?:$Attribute|$Sparse)\b//g; + if (($pl =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || # function pointer declarations - $prevline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || + $pl =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define - $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || + $pl =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros - $prevline =~ /^\+\s+$declaration_macros/) && + $pl =~ /^\+\s+$declaration_macros/) && # for "else if" which can look like "$Ident $Ident" - !($prevline =~ /^\+\s+$c90_Keywords\b/ || + !($pl =~ /^\+\s+$c90_Keywords\b/ || # other possible extensions of declaration lines - $prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || + $pl =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || # not starting a section or a macro "\" extended line - $prevline =~ /(?:\{\s*|\\)$/) && + $pl =~ /(?:\{\s*|\\)$/) && # looks like a declaration - !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + !($sl =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || # function pointer declarations - $sline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || + $sl =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define - $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || + $sl =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros - $sline =~ /^\+\s+$declaration_macros/ || + $sl =~ /^\+\s+$declaration_macros/ || # start of struct or union or enum - $sline =~ /^\+\s+(?:static\s+)?(?:const\s+)?(?:union|struct|enum|typedef)\b/ || + $sl =~ /^\+\s+(?:static\s+)?(?:const\s+)?(?:union|struct|enum|typedef)\b/ || # start or end of block or continuation of declaration - $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ || + $sl =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ || # bitfield continuation - $sline =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ || + $sl =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ || # other possible extensions of declaration lines - $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && - # indentation of previous and current line are the same - (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { - if (WARN("LINE_SPACING", - "Missing a blank line after declarations\n" . $hereprev) && - $fix) { - fix_insert_line($fixlinenr, "\+"); + $sl =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/)) { + if (WARN("LINE_SPACING", + "Missing a blank line after declarations\n" . $hereprev) && + $fix) { + fix_insert_line($fixlinenr, "\+"); + } } } From 35cdcbfc5cfc30012b790d9b077bd949ad46f1dd Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Thu, 25 Feb 2021 17:21:44 -0800 Subject: [PATCH 488/506] checkpatch: ignore warning designated initializers using NR_CPUS Some max_length wants to hold as large room as possible to ensure enough size to tackle with the biggest NR_CPUS. An example below: kernel/cgroup/cpuset.c: static struct cftype legacy_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, ... } Link: https://lkml.kernel.org/r/5d4998aa8a8ac7efada2c7daffa9e73559f8b186.1609331255.git.rocking@linux.alibaba.com Signed-off-by: Peng Wang Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bc8069503819..7ba8fbbf9f1b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -7025,12 +7025,14 @@ sub process { # use of NR_CPUS is usually wrong # ignore definitions of NR_CPUS and usage to define arrays as likely right +# ignore designated initializers using NR_CPUS if ($line =~ /\bNR_CPUS\b/ && $line !~ /^.\s*\s*#\s*if\b.*\bNR_CPUS\b/ && $line !~ /^.\s*\s*#\s*define\b.*\bNR_CPUS\b/ && $line !~ /^.\s*$Declare\s.*\[[^\]]*NR_CPUS[^\]]*\]/ && $line !~ /\[[^\]]*\.\.\.[^\]]*NR_CPUS[^\]]*\]/ && - $line !~ /\[[^\]]*NR_CPUS[^\]]*\.\.\.[^\]]*\]/) + $line !~ /\[[^\]]*NR_CPUS[^\]]*\.\.\.[^\]]*\]/ && + $line !~ /^.\s*\.\w+\s*=\s*.*\bNR_CPUS\b/) { WARN("NR_CPUS", "usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc\n" . $herecurr); From ea7dbab3e5054db7c013579096cfe7b0f10d1d65 Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Thu, 25 Feb 2021 17:21:47 -0800 Subject: [PATCH 489/506] checkpatch: trivial style fixes Indentations should use tabs wherever possible. Replace spaces by tabs for indents. Link: https://lkml.kernel.org/r/20210105103044.40282-1-dwaipayanray1@gmail.com Signed-off-by: Dwaipayan Ray Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7ba8fbbf9f1b..345879a305be 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2974,7 +2974,7 @@ sub process { } if (!defined $lines[$linenr]) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline); + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline); } elsif ($rawlines[$linenr] !~ /^\s*signed-off-by:\s*(.*)/i) { WARN("BAD_SIGN_OFF", "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); @@ -2997,8 +2997,8 @@ sub process { if (ERROR("GERRIT_CHANGE_ID", "Remove Gerrit Change-Id's before submitting upstream\n" . $herecurr) && $fix) { - fix_delete_line($fixlinenr, $rawline); - } + fix_delete_line($fixlinenr, $rawline); + } } # Check if the commit log is in a possible stack dump @@ -3240,10 +3240,10 @@ sub process { next if ($start_char =~ /^\S$/); next if (index(" \t.,;?!", $end_char) == -1); - # avoid repeating hex occurrences like 'ff ff fe 09 ...' - if ($first =~ /\b[0-9a-f]{2,}\b/i) { - next if (!exists($allow_repeated_words{lc($first)})); - } + # avoid repeating hex occurrences like 'ff ff fe 09 ...' + if ($first =~ /\b[0-9a-f]{2,}\b/i) { + next if (!exists($allow_repeated_words{lc($first)})); + } if (WARN("REPEATED_WORD", "Possible repeated word: '$first'\n" . $herecurr) && @@ -4423,7 +4423,7 @@ sub process { WARN("STATIC_CONST_CHAR_ARRAY", "char * array declaration might be better as static const\n" . $herecurr); - } + } # check for sizeof(foo)/sizeof(foo[0]) that could be ARRAY_SIZE(foo) if ($line =~ m@\bsizeof\s*\(\s*($Lval)\s*\)@) { @@ -5276,7 +5276,7 @@ sub process { $lines[$linenr - 3] !~ /^[ +]\s*$Ident\s*:/) { WARN("RETURN_VOID", "void function return statements are not generally useful\n" . $hereprev); - } + } # if statements using unnecessary parentheses - ie: if ((foo == bar)) if ($perl_version_ok && From adb2da82fcf99b6006fbaf3e3cd12649365fc967 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:21:50 -0800 Subject: [PATCH 490/506] checkpatch: prefer ftrace over function entry/exit printks Prefer using ftrace over function entry/exit logging messages. Warn with various function entry/exit only logging that only use __func__ with or without descriptive decoration. Link: https://lkml.kernel.org/r/47c01081533a417c99c9a80a4cd537f8c308503f.camel@perches.com Signed-off-by: Joe Perches Cc: Dan Carpenter Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 345879a305be..736129c21c16 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -507,6 +507,30 @@ our $signature_tags = qr{(?xi: Cc: )}; +our $tracing_logging_tags = qr{(?xi: + [=-]*> | + <[=-]* | + \[ | + \] | + start | + called | + entered | + entry | + enter | + in | + inside | + here | + begin | + exit | + end | + done | + leave | + completed | + out | + return | + [\.\!:\s]* +)}; + sub edit_distance_min { my (@arr) = @_; my $len = scalar @arr; @@ -5972,6 +5996,17 @@ sub process { "Prefer using '\"%s...\", __func__' to using '$context_function', this function's name, in a string\n" . $herecurr); } +# check for unnecessary function tracing like uses +# This does not use $logFunctions because there are many instances like +# 'dprintk(FOO, "%s()\n", __func__);' which do not match $logFunctions + if ($rawline =~ /^\+.*\([^"]*"$tracing_logging_tags{0,3}%s(?:\s*\(\s*\)\s*)?$tracing_logging_tags{0,3}(?:\\n)?"\s*,\s*__func__\s*\)\s*;/) { + if (WARN("TRACING_LOGGING", + "Unnecessary ftrace-like logging - prefer using ftrace\n" . $herecurr) && + $fix) { + fix_delete_line($fixlinenr, $rawline); + } + } + # check for spaces before a quoted newline if ($rawline =~ /^.*\".*\s\\n/) { if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", From 0972b8bfe0de8c0f05796aceb8f2428b0efb20cd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:21:54 -0800 Subject: [PATCH 491/506] checkpatch: improve TYPECAST_INT_CONSTANT test message Improve the TYPECAST_INT_CONSTANT test by showing the suggested conversion for various type of uses like (unsigned int)1 to 1U. Link: https://lkml.kernel.org/r/ecefe8dcb93fe7028311b69dd297ba52224233d4.camel@perches.com Signed-off-by: Joe Perches Cc: Douglas Gilbert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 736129c21c16..a04df2657d49 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6518,18 +6518,18 @@ sub process { if ($line =~ /(\(\s*$C90_int_types\s*\)\s*)($Constant)\b/) { my $cast = $1; my $const = $2; + my $suffix = ""; + my $newconst = $const; + $newconst =~ s/${Int_type}$//; + $suffix .= 'U' if ($cast =~ /\bunsigned\b/); + if ($cast =~ /\blong\s+long\b/) { + $suffix .= 'LL'; + } elsif ($cast =~ /\blong\b/) { + $suffix .= 'L'; + } if (WARN("TYPECAST_INT_CONSTANT", - "Unnecessary typecast of c90 int constant\n" . $herecurr) && + "Unnecessary typecast of c90 int constant - '$cast$const' could be '$const$suffix'\n" . $herecurr) && $fix) { - my $suffix = ""; - my $newconst = $const; - $newconst =~ s/${Int_type}$//; - $suffix .= 'U' if ($cast =~ /\bunsigned\b/); - if ($cast =~ /\blong\s+long\b/) { - $suffix .= 'LL'; - } elsif ($cast =~ /\blong\b/) { - $suffix .= 'L'; - } $fixed[$fixlinenr] =~ s/\Q$cast\E$const\b/$newconst$suffix/; } } From de93245c00a44578ae73964b7e36607d04fed5b3 Mon Sep 17 00:00:00 2001 From: Aditya Srivastava Date: Thu, 25 Feb 2021 17:21:57 -0800 Subject: [PATCH 492/506] checkpatch: add warning for avoiding .L prefix symbols in assembly files objtool requires that all code must be contained in an ELF symbol. Symbol names that have a '.L' prefix do not emit symbol table entries, as they have special meaning for the assembler. '.L' prefixed symbols can be used within a code region, but should be avoided for denoting a range of code via 'SYM_*_START/END' annotations. Add a new check to emit a warning on finding the usage of '.L' symbols for '.S' files, if it denotes range of code via SYM_*_START/END annotation pair. Link: https://lkml.kernel.org/r/20210123190459.9701-1-yashsri421@gmail.com Link: https://lore.kernel.org/lkml/20210112210154.GI4646@sirena.org.uk Signed-off-by: Aditya Srivastava Suggested-by: Mark Brown Acked-by: Joe Perches Acked-by: Nick Desaulniers Cc: Aditya Srivastava Cc: Lukas Bulwahn Cc: Dwaipayan Ray Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index a04df2657d49..d8793bdbc492 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3599,6 +3599,13 @@ sub process { } } +# check for .L prefix local symbols in .S files + if ($realfile =~ /\.S$/ && + $line =~ /^\+\s*(?:[A-Z]+_)?SYM_[A-Z]+_(?:START|END)(?:_[A-Z_]+)?\s*\(\s*\.L/) { + WARN("AVOID_L_PREFIX", + "Avoid using '.L' prefixed local symbol names for denoting a range of code via 'SYM_*_START/END' annotations; see Documentation/asm-annotations.rst\n" . $herecurr); + } + # check we are in a valid source file C or perl if not then ignore this hunk next if ($realfile !~ /\.(h|c|pl|dtsi|dts)$/); From 58f02267f04a79a5ef13dfbcf30f5ae080389f87 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 25 Feb 2021 17:22:01 -0800 Subject: [PATCH 493/506] checkpatch: add kmalloc_array_node to unnecessary OOM message check commit 5799b255c491 ("include/linux/slab.h: add kmalloc_array_node() and kcalloc_node()") was added in 2017. Update the unnecessary OOM message test to include it. Link: https://lkml.kernel.org/r/b9dc4a808b1518e08ab8761480d9872e5d18e7cd.camel@perches.com Signed-off-by: Joe Perches Reported-by: Jakub Kicinski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d8793bdbc492..3cdc0703e00f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -487,7 +487,7 @@ our $logFunctions = qr{(?x: our $allocFunctions = qr{(?x: (?:(?:devm_)? - (?:kv|k|v)[czm]alloc(?:_node|_array)? | + (?:kv|k|v)[czm]alloc(?:_array)?(?:_node)? | kstrdup(?:_const)? | kmemdup(?:_nul)?) | (?:\w+)?alloc_skb(?:_ip_align)? | From 263afd39c06f5939ef943e0d535380d4b8e56484 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Thu, 25 Feb 2021 17:22:04 -0800 Subject: [PATCH 494/506] checkpatch: don't warn about colon termination in linker scripts This check erroneously flags cases like the one in my recent printk enumeration patch[0], where the spaces are syntactic, and `section:' vs. `section :' is syntactically important: ERROR: space prohibited before that ':' (ctx:WxW) #258: FILE: include/asm-generic/vmlinux.lds.h:314: + .printk_fmts : AT(ADDR(.printk_fmts) - LOAD_OFFSET) { 0: https://lore.kernel.org/patchwork/patch/1375749/ Link: https://lkml.kernel.org/r/YBwhqsc2TIVeid3t@chrisdown.name Link: https://lkml.kernel.org/r/YB6UsjCOy1qrrlSD@chrisdown.name Signed-off-by: Chris Down Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3cdc0703e00f..75c93316547b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5044,7 +5044,7 @@ sub process { # A colon needs no spaces before when it is # terminating a case value or a label. } elsif ($opv eq ':C' || $opv eq ':L') { - if ($ctx =~ /Wx./) { + if ($ctx =~ /Wx./ and $realfile !~ m@.*\.lds\.h$@) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); From 5b8f82e1a17695c9e5fec5842b234967782d7e5b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 17:22:08 -0800 Subject: [PATCH 495/506] checkpatch: do not apply "initialise globals to 0" check to BPF progs BPF programs explicitly initialise global variables to 0 to make sure clang (v10 or older) do not put the variables in the common section. Skip "initialise globals to 0" check for BPF programs to elimiate error messages like: ERROR: do not initialise globals to 0 #19: FILE: samples/bpf/tracex1_kern.c:21: Link: https://lkml.kernel.org/r/20210209211954.490077-1-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 75c93316547b..df8b23dc1eb0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2453,6 +2453,15 @@ sub get_raw_comment { return $comment; } +sub exclude_global_initialisers { + my ($realfile) = @_; + + # Do not check for BPF programs (tools/testing/selftests/bpf/progs/*.c, samples/bpf/*_kern.c, *.bpf.c). + return $realfile =~ m@^tools/testing/selftests/bpf/progs/.*\.c$@ || + $realfile =~ m@^samples/bpf/.*_kern\.c$@ || + $realfile =~ m@/bpf/.*\.bpf\.c$@; +} + sub process { my $filename = shift; @@ -4358,7 +4367,8 @@ sub process { } # check for global initialisers. - if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*($zero_initializer)\s*;/) { + if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*($zero_initializer)\s*;/ && + !exclude_global_initialisers($realfile)) { if (ERROR("GLOBAL_INITIALISERS", "do not initialise globals to $1\n" . $herecurr) && $fix) { From 073a9ecb3a73401662430bb955aedeac1de643d1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Feb 2021 17:22:11 -0800 Subject: [PATCH 496/506] init/version.c: remove Version_ symbol This code hunk creates a Version_ symbol if CONFIG_KALLSYMS is disabled. For example, building the kernel v5.10 for allnoconfig creates the following symbol: $ nm vmlinux | grep Version_ c116b028 B Version_330240 There is no in-tree user of this symbol. Commit 197dcffc8ba0 ("init/version.c: define version_string only if CONFIG_KALLSYMS is not defined") mentions that Version_* is only used with ksymoops. However, a commit in the pre-git era [1] had added the statement, "ksymoops is useless on 2.6. Please use the Oops in its original format". That statement existed until commit 4eb9241127a0 ("Documentation: admin-guide: update bug-hunting.rst") finally removed the stale ksymoops information. This symbol is no longer needed. [1] https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=ad68b2f085f5c79e4759ca2d13947b3c885ee831 Link: https://lkml.kernel.org/r/20210120033452.2895170-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Cc: Mauro Carvalho Chehab Cc: Randy Dunlap Cc: Daniel Guilak Cc: Lee Revell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/version.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/init/version.c b/init/version.c index 80d2b7566b39..92afc782b043 100644 --- a/init/version.c +++ b/init/version.c @@ -16,14 +16,6 @@ #include #include -#ifndef CONFIG_KALLSYMS -#define version(a) Version_ ## a -#define version_string(a) version(a) - -extern int version_string(LINUX_VERSION_CODE); -int version_string(LINUX_VERSION_CODE); -#endif - struct uts_namespace init_uts_ns = { .ns.count = REFCOUNT_INIT(2), .name = { From a5a673f7312253a842f3da8c60c980461cc269ec Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Feb 2021 17:22:15 -0800 Subject: [PATCH 497/506] init: clean up early_param_on_off() macro Use early_param() to define early_param_on_off(). Link: https://lkml.kernel.org/r/20210201041532.4025025-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Reviewed-by: Johan Hovold Reviewed-by: Miguel Ojeda Cc: Masahiro Yamada Cc: Joe Perches Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/init.h b/include/linux/init.h index a01f01c1a5c5..31f54de58429 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -338,14 +338,14 @@ struct obs_kernel_param { var = 1; \ return 0; \ } \ - __setup_param(str_on, parse_##var##_on, parse_##var##_on, 1); \ + early_param(str_on, parse_##var##_on); \ \ static int __init parse_##var##_off(char *arg) \ { \ var = 0; \ return 0; \ } \ - __setup_param(str_off, parse_##var##_off, parse_##var##_off, 1) + early_param(str_off, parse_##var##_off) /* Relies on boot_command_line being set */ void __init parse_early_param(void); From f9c8bc4604c95a7c55293f244f67753f6e96096f Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 25 Feb 2021 17:22:18 -0800 Subject: [PATCH 498/506] init/Kconfig: fix a typo in CC_VERSION_TEXT help text s/compier/compiler/ Link: https://lkml.kernel.org/r/20210224223325.29099-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Reviewed-by: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 719871f8727c..efdc35abccb6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -19,7 +19,7 @@ config CC_VERSION_TEXT CC_VERSION_TEXT so it is recorded in include/config/auto.conf.cmd. When the compiler is updated, Kconfig will be invoked. - - Ensure full rebuild when the compier is updated + - Ensure full rebuild when the compiler is updated include/linux/kconfig.h contains this option in the comment line so fixdep adds include/config/cc/version/text.h into the auto-generated dependency. When the compiler is updated, syncconfig will touch it From 3159ed57792be7453793bda27297a423e1c63d6c Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 25 Feb 2021 17:22:22 -0800 Subject: [PATCH 499/506] fs/coredump: use kmap_local_page() In dump_user_range() there is no reason for the mapping to be global. Use kmap_local_page() rather than kmap. Link: https://lkml.kernel.org/r/20210203223328.558945-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index ae778937a1ff..1c0fdc1aa70b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -897,10 +897,10 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap(page); + void *kaddr = kmap_local_page(page); stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); + kunmap_local(kaddr); put_page(page); } else { stop = !dump_skip(cprm, PAGE_SIZE); From b3656d8227f4c45812c6b40815d8f4e446ed372a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Feb 2021 17:22:25 -0800 Subject: [PATCH 500/506] seq_file: document how per-entry resources are managed. Patch series "Fix some seq_file users that were recently broken". A recent change to seq_file broke some users which were using seq_file in a non-"standard" way ... though the "standard" isn't documented, so they can be excused. The result is a possible leak - of memory in one case, of references to a 'transport' in the other. These three patches: 1/ document and explain the problem 2/ fix the problem user in x86 3/ fix the problem user in net/sctp This patch (of 3): Users of seq_file will sometimes find it convenient to take a resource, such as a lock or memory allocation, in the ->start or ->next operations. These are per-entry resources, distinct from per-session resources which are taken in ->start and released in ->stop. The preferred management of these is release the resource on the subsequent call to ->next or ->stop. However prior to Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") it happened that ->show would always be called after ->start or ->next, and a few users chose to release the resource in ->show. This is no longer reliable. Since the mentioned commit, ->next will always come after a successful ->show (to ensure m->index is updated correctly), so the original ordering cannot be maintained. This patch updates the documentation to clearly state the required behaviour. Other patches will fix the few problematic users. [akpm@linux-foundation.org: fix typo, per Willy] Link: https://lkml.kernel.org/r/161248518659.21478.2484341937387294998.stgit@noble1 Link: https://lkml.kernel.org/r/161248539020.21478.3147971477400875336.stgit@noble1 Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") Signed-off-by: NeilBrown Cc: Xin Long Cc: Alexander Viro Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Vlad Yasevich Cc: Neil Horman Cc: Marcelo Ricardo Leitner Cc: "David S. Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/seq_file.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/filesystems/seq_file.rst b/Documentation/filesystems/seq_file.rst index 56856481dc8d..a6726082a7c2 100644 --- a/Documentation/filesystems/seq_file.rst +++ b/Documentation/filesystems/seq_file.rst @@ -217,6 +217,12 @@ between the calls to start() and stop(), so holding a lock during that time is a reasonable thing to do. The seq_file code will also avoid taking any other locks while the iterator is active. +The iterater value returned by start() or next() is guaranteed to be +passed to a subsequent next() or stop() call. This allows resources +such as locks that were taken to be reliably released. There is *no* +guarantee that the iterator will be passed to show(), though in practice +it often will be. + Formatted output ================ From 3d2fc4c082448e9c05792f9b2a11c1d5db408b85 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Feb 2021 17:22:29 -0800 Subject: [PATCH 501/506] x86: fix seq_file iteration for pat/memtype.c The memtype seq_file iterator allocates a buffer in the ->start and ->next functions and frees it in the ->show function. The preferred handling for such resources is to free them in the subsequent ->next or ->stop function call. Since Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") there is no guarantee that ->show will be called after ->next, so this function can now leak memory. So move the freeing of the buffer to ->next and ->stop. Link: https://lkml.kernel.org/r/161248539022.21478.13874455485854739066.stgit@noble1 Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") Signed-off-by: NeilBrown Cc: Xin Long Cc: Alexander Viro Cc: Andy Lutomirski Cc: Dave Hansen Cc: "David S. Miller" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Marcelo Ricardo Leitner Cc: Neil Horman Cc: Peter Zijlstra Cc: Vlad Yasevich Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat/memtype.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 8f665c352bf0..ca311aaa67b8 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1164,12 +1164,14 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + kfree(v); ++*pos; return memtype_get_idx(*pos); } static void memtype_seq_stop(struct seq_file *seq, void *v) { + kfree(v); } static int memtype_seq_show(struct seq_file *seq, void *v) @@ -1181,8 +1183,6 @@ static int memtype_seq_show(struct seq_file *seq, void *v) entry_print->end, cattr_name(entry_print->type)); - kfree(entry_print); - return 0; } From db7fbf492d94a0b59d8f85b3184231662586dea9 Mon Sep 17 00:00:00 2001 From: George Prekas Date: Thu, 25 Feb 2021 17:22:34 -0800 Subject: [PATCH 502/506] scripts/gdb: fix list_for_each If the list is uninitialized (next pointer is NULL), list_for_each gets stuck in an infinite loop. Print a message and treat list as empty. Link: https://lkml.kernel.org/r/4ae23bb1-c333-f669-da2d-fa35c4f49018@amazon.com Signed-off-by: George Prekas Reviewed-by: Jan Kiszka Cc: Kieran Bingham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gdb/linux/lists.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py index c487ddf09d38..bae4d70b7eae 100644 --- a/scripts/gdb/linux/lists.py +++ b/scripts/gdb/linux/lists.py @@ -27,6 +27,11 @@ def list_for_each(head): raise TypeError("Must be struct list_head not {}" .format(head.type)) + if head['next'] == 0: + gdb.write("list_for_each: Uninitialized list '{}' treated as empty\n" + .format(head.address)) + return + node = head['next'].dereference() while node.address != head.address: yield node.address From d54ce6158e354f5358a547b96299ecd7f3725393 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 25 Feb 2021 17:22:38 -0800 Subject: [PATCH 503/506] kgdb: fix to kill breakpoints on initmem after boot Currently breakpoints in kernel .init.text section are not handled correctly while allowing to remove them even after corresponding pages have been freed. Fix it via killing .init.text section breakpoints just prior to initmem pages being freed. Doug: "HW breakpoints aren't handled by this patch but it's probably not such a big deal". Link: https://lkml.kernel.org/r/20210224081652.587785-1-sumit.garg@linaro.org Signed-off-by: Sumit Garg Suggested-by: Doug Anderson Acked-by: Doug Anderson Acked-by: Daniel Thompson Tested-by: Daniel Thompson Cc: Masami Hiramatsu Cc: Steven Rostedt (VMware) Cc: Jason Wessel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kgdb.h | 2 ++ init/main.c | 1 + kernel/debug/debug_core.c | 11 +++++++++++ 3 files changed, 14 insertions(+) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 0444b44bd156..392a3670944c 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -359,9 +359,11 @@ extern atomic_t kgdb_active; extern bool dbg_is_early; extern void __init dbg_late_init(void); extern void kgdb_panic(const char *msg); +extern void kgdb_free_init_mem(void); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() static inline void kgdb_panic(const char *msg) {} +static inline void kgdb_free_init_mem(void) { } #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/init/main.c b/init/main.c index 3648c9f94882..53b278845b88 100644 --- a/init/main.c +++ b/init/main.c @@ -1426,6 +1426,7 @@ static int __ref kernel_init(void *unused) async_synchronize_full(); kprobe_free_init_mem(); ftrace_free_init_mem(); + kgdb_free_init_mem(); free_initmem(); mark_readonly(); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b636d517c02c..4708aec492df 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -455,6 +455,17 @@ int dbg_remove_all_break(void) return 0; } +void kgdb_free_init_mem(void) +{ + int i; + + /* Clear init memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0)) + kgdb_break[i].state = BP_UNDEFINED; + } +} + #ifdef CONFIG_KGDB_KDB void kdb_dump_stack_on_cpu(int cpu) { From 6aaa31aeb9cf260e1b7155cc11ec864f052db5ec Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 25 Feb 2021 17:22:42 -0800 Subject: [PATCH 504/506] ubsan: remove overflow checks Since GCC 8.0 -fsanitize=signed-integer-overflow doesn't work with -fwrapv. -fwrapv makes signed overflows defines and GCC essentially disables ubsan checks. On GCC < 8.0 -fwrapv doesn't have influence on -fsanitize=signed-integer-overflow setting, so it kinda works but generates false-positves and violates uaccess rules: lib/iov_iter.o: warning: objtool: iovec_from_user()+0x22d: call to __ubsan_handle_add_overflow() with UACCESS enabled Disable signed overflow checks to avoid these problems. Remove unsigned overflow checks as well. Unsigned overflow appeared as side effect of commit cdf8a76fda4a ("ubsan: move cc-option tests into Kconfig"), but it never worked (kernel doesn't boot). And unsigned overflows are allowed by C standard, so it just pointless. Link: https://lkml.kernel.org/r/20210209232348.20510-1-ryabinin.a.a@gmail.com Signed-off-by: Andrey Ryabinin Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Randy Dunlap Cc: Stephen Rothwell Cc: Dmitry Vyukov Cc: Kees Cook Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 17 ----------- lib/test_ubsan.c | 49 ------------------------------ lib/ubsan.c | 68 ------------------------------------------ scripts/Makefile.ubsan | 2 -- 4 files changed, 136 deletions(-) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 3a0b1c930733..e5372a13511d 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -112,23 +112,6 @@ config UBSAN_UNREACHABLE This option enables -fsanitize=unreachable which checks for control flow reaching an expected-to-be-unreachable position. -config UBSAN_SIGNED_OVERFLOW - bool "Perform checking for signed arithmetic overflow" - default UBSAN - depends on $(cc-option,-fsanitize=signed-integer-overflow) - help - This option enables -fsanitize=signed-integer-overflow which checks - for overflow of any arithmetic operations with signed integers. - -config UBSAN_UNSIGNED_OVERFLOW - bool "Perform checking for unsigned arithmetic overflow" - depends on $(cc-option,-fsanitize=unsigned-integer-overflow) - depends on !X86_32 # avoid excessive stack usage on x86-32/clang - help - This option enables -fsanitize=unsigned-integer-overflow which checks - for overflow of any arithmetic operations with unsigned integers. This - currently causes x86 to fail to boot. - config UBSAN_OBJECT_SIZE bool "Perform checking for accesses beyond the end of objects" default UBSAN diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 5e5d9355ef49..7e7bbd0f3fd2 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -11,51 +11,6 @@ typedef void(*test_ubsan_fp)(void); #config, IS_ENABLED(config) ? "y" : "n"); \ } while (0) -static void test_ubsan_add_overflow(void) -{ - volatile int val = INT_MAX; - volatile unsigned int uval = UINT_MAX; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val += 2; - - UBSAN_TEST(CONFIG_UBSAN_UNSIGNED_OVERFLOW); - uval += 2; -} - -static void test_ubsan_sub_overflow(void) -{ - volatile int val = INT_MIN; - volatile unsigned int uval = 0; - volatile int val2 = 2; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val -= val2; - - UBSAN_TEST(CONFIG_UBSAN_UNSIGNED_OVERFLOW); - uval -= val2; -} - -static void test_ubsan_mul_overflow(void) -{ - volatile int val = INT_MAX / 2; - volatile unsigned int uval = UINT_MAX / 2; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val *= 3; - - UBSAN_TEST(CONFIG_UBSAN_UNSIGNED_OVERFLOW); - uval *= 3; -} - -static void test_ubsan_negate_overflow(void) -{ - volatile int val = INT_MIN; - - UBSAN_TEST(CONFIG_UBSAN_SIGNED_OVERFLOW); - val = -val; -} - static void test_ubsan_divrem_overflow(void) { volatile int val = 16; @@ -155,10 +110,6 @@ static void test_ubsan_object_size_mismatch(void) } static const test_ubsan_fp test_ubsan_array[] = { - test_ubsan_add_overflow, - test_ubsan_sub_overflow, - test_ubsan_mul_overflow, - test_ubsan_negate_overflow, test_ubsan_shift_out_of_bounds, test_ubsan_out_of_bounds, test_ubsan_load_invalid_value, diff --git a/lib/ubsan.c b/lib/ubsan.c index bec38c64d6a6..26229973049d 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -163,74 +163,6 @@ static void ubsan_epilogue(void) } } -static void handle_overflow(struct overflow_data *data, void *lhs, - void *rhs, char op) -{ - - struct type_descriptor *type = data->type; - char lhs_val_str[VALUE_LENGTH]; - char rhs_val_str[VALUE_LENGTH]; - - if (suppress_report(&data->location)) - return; - - ubsan_prologue(&data->location, type_is_signed(type) ? - "signed-integer-overflow" : - "unsigned-integer-overflow"); - - val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs); - val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs); - pr_err("%s %c %s cannot be represented in type %s\n", - lhs_val_str, - op, - rhs_val_str, - type->type_name); - - ubsan_epilogue(); -} - -void __ubsan_handle_add_overflow(void *data, - void *lhs, void *rhs) -{ - - handle_overflow(data, lhs, rhs, '+'); -} -EXPORT_SYMBOL(__ubsan_handle_add_overflow); - -void __ubsan_handle_sub_overflow(void *data, - void *lhs, void *rhs) -{ - handle_overflow(data, lhs, rhs, '-'); -} -EXPORT_SYMBOL(__ubsan_handle_sub_overflow); - -void __ubsan_handle_mul_overflow(void *data, - void *lhs, void *rhs) -{ - handle_overflow(data, lhs, rhs, '*'); -} -EXPORT_SYMBOL(__ubsan_handle_mul_overflow); - -void __ubsan_handle_negate_overflow(void *_data, void *old_val) -{ - struct overflow_data *data = _data; - char old_val_str[VALUE_LENGTH]; - - if (suppress_report(&data->location)) - return; - - ubsan_prologue(&data->location, "negation-overflow"); - - val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val); - - pr_err("negation of %s cannot be represented in type %s:\n", - old_val_str, data->type->type_name); - - ubsan_epilogue(); -} -EXPORT_SYMBOL(__ubsan_handle_negate_overflow); - - void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { struct overflow_data *data = _data; diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 0e53a93e8f15..9e2092fd5206 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -8,8 +8,6 @@ ubsan-cflags-$(CONFIG_UBSAN_LOCAL_BOUNDS) += -fsanitize=local-bounds ubsan-cflags-$(CONFIG_UBSAN_SHIFT) += -fsanitize=shift ubsan-cflags-$(CONFIG_UBSAN_DIV_ZERO) += -fsanitize=integer-divide-by-zero ubsan-cflags-$(CONFIG_UBSAN_UNREACHABLE) += -fsanitize=unreachable -ubsan-cflags-$(CONFIG_UBSAN_SIGNED_OVERFLOW) += -fsanitize=signed-integer-overflow -ubsan-cflags-$(CONFIG_UBSAN_UNSIGNED_OVERFLOW) += -fsanitize=unsigned-integer-overflow ubsan-cflags-$(CONFIG_UBSAN_OBJECT_SIZE) += -fsanitize=object-size ubsan-cflags-$(CONFIG_UBSAN_BOOL) += -fsanitize=bool ubsan-cflags-$(CONFIG_UBSAN_ENUM) += -fsanitize=enum From dd23e8098f33a55b22b869bc7fc0a795ccbb9f87 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 25 Feb 2021 17:22:46 -0800 Subject: [PATCH 505/506] initramfs: panic with memory information On systems with large amounts of reserved memory we may fail to successfully complete unpack_to_rootfs() and be left with: Kernel panic - not syncing: write error this is not too helpful to understand what happened, so let's wrap the panic() calls with a surrounding show_mem() such that we have a chance of understanding the memory conditions leading to these allocation failures. [akpm@linux-foundation.org: replace macro with C function] Link: https://lkml.kernel.org/r/20210114231517.1854379-1-f.fainelli@gmail.com Signed-off-by: Florian Fainelli Cc: Barret Rhoden Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 55b74d7e5260..5fa84711127a 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,16 @@ static void __init error(char *x) message = x; } +static void panic_show_mem(const char *fmt, ...) +{ + va_list args; + + show_mem(0, NULL); + va_start(args, fmt); + panic(fmt, args); + va_end(args); +} + /* link hash */ #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) @@ -80,7 +91,7 @@ static char __init *find_link(int major, int minor, int ino, } q = kmalloc(sizeof(struct hash), GFP_KERNEL); if (!q) - panic("can't allocate link hash entry"); + panic_show_mem("can't allocate link hash entry"); q->major = major; q->minor = minor; q->ino = ino; @@ -125,7 +136,7 @@ static void __init dir_add(const char *name, time64_t mtime) { struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); if (!de) - panic("can't allocate dir_entry buffer"); + panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); de->name = kstrdup(name, GFP_KERNEL); de->mtime = mtime; @@ -460,7 +471,7 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len) name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); if (!header_buf || !symlink_buf || !name_buf) - panic("can't allocate buffers"); + panic_show_mem("can't allocate buffers"); state = Start; this_header = 0; @@ -607,7 +618,7 @@ static int __init populate_rootfs(void) /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) - panic("%s", err); /* Failed to decompress INTERNAL initramfs */ + panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) goto done; From f685a533a7fab35c5d069dcd663f59c8e4171a75 Mon Sep 17 00:00:00 2001 From: Huang Pei Date: Thu, 25 Feb 2021 17:22:49 -0800 Subject: [PATCH 506/506] MIPS: make userspace mapping young by default MIPS page fault path(except huge page) takes 3 exceptions (1 TLB Miss + 2 TLB Invalid), butthe second TLB Invalid exception is just triggered by __update_tlb from do_page_fault writing tlb without _PAGE_VALID set. With this patch, user space mapping prot is made young by default (with both _PAGE_VALID and _PAGE_YOUNG set), and it only take 1 TLB Miss + 1 TLB Invalid exception Remove pte_sw_mkyoung without polluting MM code and make page fault delay of MIPS on par with other architecture Link: https://lkml.kernel.org/r/20210204013942.8398-1-huangpei@loongson.cn Signed-off-by: Huang Pei Reviewed-by: Nicholas Piggin Acked-by: Acked-by: Thomas Bogendoerfer Cc: Christophe Leroy Cc: Cc: Bibo Mao Cc: Jiaxun Yang Cc: Paul Burton Cc: Li Xuefeng Cc: Yang Tiezhu Cc: Gao Juxin Cc: Fuxin Zhang Cc: Huacai Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/cache.c | 30 ++++++++++++++++-------------- include/linux/pgtable.h | 8 -------- mm/memory.c | 4 ---- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 1754498b0717..7719d632df8d 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -157,29 +157,31 @@ unsigned long _page_cachable_default; EXPORT_SYMBOL(_page_cachable_default); #define PM(p) __pgprot(_page_cachable_default | (p)) +#define PVA(p) PM(_PAGE_VALID | _PAGE_ACCESSED | (p)) static inline void setup_protection_map(void) { protection_map[0] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[1] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[2] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[3] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[4] = PM(_PAGE_PRESENT); - protection_map[5] = PM(_PAGE_PRESENT); - protection_map[6] = PM(_PAGE_PRESENT); - protection_map[7] = PM(_PAGE_PRESENT); + protection_map[1] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[2] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[3] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[4] = PVA(_PAGE_PRESENT); + protection_map[5] = PVA(_PAGE_PRESENT); + protection_map[6] = PVA(_PAGE_PRESENT); + protection_map[7] = PVA(_PAGE_PRESENT); protection_map[8] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[9] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | + protection_map[9] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[10] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | _PAGE_NO_READ); - protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); - protection_map[12] = PM(_PAGE_PRESENT); - protection_map[13] = PM(_PAGE_PRESENT); - protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE); - protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE); + protection_map[11] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); + protection_map[12] = PVA(_PAGE_PRESENT); + protection_map[13] = PVA(_PAGE_PRESENT); + protection_map[14] = PVA(_PAGE_PRESENT); + protection_map[15] = PVA(_PAGE_PRESENT); } +#undef _PVA #undef PM void cpu_cache_init(void) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 36eb748f3c97..cdfc4e9f253e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -432,14 +432,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ -#ifndef pte_sw_mkyoung -static inline pte_t pte_sw_mkyoung(pte_t pte) -{ - return pte; -} -#define pte_sw_mkyoung pte_sw_mkyoung -#endif - #ifndef pte_savedwrite #define pte_savedwrite pte_write #endif diff --git a/mm/memory.c b/mm/memory.c index 784249f3307b..c8e357627318 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2902,7 +2902,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -3560,7 +3559,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -3745,8 +3743,6 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); - else - entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma);