From c5d4d7d83165ae863954b113c7f403d8b58febed Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 4 Sep 2017 10:28:02 +0200 Subject: [PATCH 001/217] xfrm: Fix deletion of offloaded SAs on failure. When we off load a SA, it gets pushed to the NIC before we can add it. In case of a failure, we don't delete this SA from the NIC. Fix this by calling xfrm_dev_state_delete on failure. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Reported-by: Shannon Nelson Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2bfbd9121e3b..b997f1395357 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -657,6 +657,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) { x->km.state = XFRM_STATE_DEAD; + xfrm_dev_state_delete(x); __xfrm_state_put(x); goto out; } From 67a63387b1417b5954eedb15f638f1f0bee3da49 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 4 Sep 2017 10:59:55 +0200 Subject: [PATCH 002/217] xfrm: Fix negative device refcount on offload failure. Reset the offload device at the xfrm_state if the device was not able to offload the state. Otherwise we drop the device refcount twice. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Reported-by: Shannon Nelson Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index acf00104ef31..30e5746085b8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -91,6 +91,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { + xso->dev = NULL; dev_put(dev); return 0; } From 23e9fcfef1f3d10675acce023592796851bcaf1a Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 12 Sep 2017 14:53:46 +0300 Subject: [PATCH 003/217] vti: fix NULL dereference in xfrm_input() Can be reproduced with LTP tests: # icmp-uni-vti.sh -p ah -a sha256 -m tunnel -S fffffffe -k 1 -s 10 IPv4: RIP: 0010:xfrm_input+0x7f9/0x870 ... Call Trace: vti_input+0xaa/0x110 [ip_vti] ? skb_free_head+0x21/0x40 vti_rcv+0x33/0x40 [ip_vti] xfrm4_ah_rcv+0x33/0x60 ip_local_deliver_finish+0x94/0x1e0 ip_local_deliver+0x6f/0xe0 ? ip_route_input_noref+0x28/0x50 ... # icmp-uni-vti.sh -6 -p ah -a sha256 -m tunnel -S fffffffe -k 1 -s 10 IPv6: RIP: 0010:xfrm_input+0x7f9/0x870 ... Call Trace: xfrm6_rcv_tnl+0x3c/0x40 vti6_rcv+0xd5/0xe0 [ip6_vti] xfrm6_ah_rcv+0x33/0x60 ip6_input_finish+0xee/0x460 ip6_input+0x3f/0xb0 ip6_rcv_finish+0x45/0xa0 ipv6_rcv+0x34b/0x540 xfrm_input() invokes xfrm_rcv_cb() -> vti_rcv_cb(), the last callback might call skb_scrub_packet(), which in turn can reset secpath. Fix it by adding a check that skb->sp is not NULL. Fixes: 7e9e9202bccc ("xfrm: Clear RX SKB secpath xfrm_offload") Signed-off-by: Alexey Kodanev Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 2515cd2bc5db..8ac9d32fb79d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -429,7 +429,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) nf_reset(skb); if (decaps) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; @@ -440,7 +441,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; From a6c215e21b0dc5fe9416dce90f9acc2ea53c4502 Mon Sep 17 00:00:00 2001 From: Jeffrey Chu Date: Fri, 8 Sep 2017 21:08:58 +0000 Subject: [PATCH 004/217] USB: serial: ftdi_sio: add id for Cypress WICED dev board Add CYPRESS_VID vid and CYPRESS_WICED_BT_USB and CYPRESS_WICED_WL_USB device IDs to ftdi_sio driver. Signed-off-by: Jeffrey Chu Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/ftdi_sio.c | 2 ++ drivers/usb/serial/ftdi_sio_ids.h | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 1cec03799cdf..49d1b2d4606d 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1015,6 +1015,8 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(WICED_VID, WICED_USB20706V2_PID) }, { USB_DEVICE(TI_VID, TI_CC3200_LAUNCHPAD_PID), .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, + { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) }, + { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) }, { } /* Terminating entry */ }; diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 4fcf1cecb6d7..f9d15bd62785 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -609,6 +609,13 @@ #define ADI_GNICE_PID 0xF000 #define ADI_GNICEPLUS_PID 0xF001 +/* + * Cypress WICED USB UART + */ +#define CYPRESS_VID 0x04B4 +#define CYPRESS_WICED_BT_USB_PID 0x009B +#define CYPRESS_WICED_WL_USB_PID 0xF900 + /* * Microchip Technology, Inc. * From 837ddc4793a69b256ac5e781a5e729b448a8d983 Mon Sep 17 00:00:00 2001 From: Henryk Heisig Date: Mon, 11 Sep 2017 17:57:34 +0200 Subject: [PATCH 005/217] USB: serial: option: add support for TP-Link LTE module This commit adds support for TP-Link LTE mPCIe module is used in in TP-Link MR200v1, MR6400v1 and v2 routers. Signed-off-by: Henryk Heisig Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 54bfef13966a..ba672cf4e888 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -522,6 +522,7 @@ static void option_instat_callback(struct urb *urb); /* TP-LINK Incorporated products */ #define TPLINK_VENDOR_ID 0x2357 +#define TPLINK_PRODUCT_LTE 0x000D #define TPLINK_PRODUCT_MA180 0x0201 /* Changhong products */ @@ -2011,6 +2012,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { USB_DEVICE(PETATEL_VENDOR_ID, PETATEL_PRODUCT_NP10T_600A) }, { USB_DEVICE(PETATEL_VENDOR_ID, PETATEL_PRODUCT_NP10T_600E) }, + { USB_DEVICE_AND_INTERFACE_INFO(TPLINK_VENDOR_ID, TPLINK_PRODUCT_LTE, 0xff, 0x00, 0x00) }, /* TP-Link LTE Module */ { USB_DEVICE(TPLINK_VENDOR_ID, TPLINK_PRODUCT_MA180), .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, { USB_DEVICE(TPLINK_VENDOR_ID, 0x9000), /* TP-Link MA260 */ From 7d34cd12061d2cf38f7505c0ab2ae2f03fbe8e08 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:12:02 -0400 Subject: [PATCH 006/217] MAINTAINERS: associate linux/fs.h with VFS instead of file locking include/linux/fs.h and include/uapi/linux/fs.h deal with much more than just file locking. Move them to the "FILESYSTEMS (VFS and infrastructure)" section of the MAINTAINERS file so that the first suggestion from get_maintainer.pl isn't the file locking maintainers, which has caused some confusion. Cc: J. Bruce Fields Cc: Al Viro Signed-off-by: Eric Biggers Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..31ac2c0ec8ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5334,9 +5334,7 @@ M: "J. Bruce Fields" L: linux-fsdevel@vger.kernel.org S: Maintained F: include/linux/fcntl.h -F: include/linux/fs.h F: include/uapi/linux/fcntl.h -F: include/uapi/linux/fs.h F: fs/fcntl.c F: fs/locks.c @@ -5345,6 +5343,8 @@ M: Alexander Viro L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/* +F: include/linux/fs.h +F: include/uapi/linux/fs.h FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio From 7eac35ea29dc54cbc8399de84c9bf16553575b89 Mon Sep 17 00:00:00 2001 From: Sebastian Frei Date: Tue, 12 Sep 2017 09:50:59 +0200 Subject: [PATCH 007/217] USB: serial: cp210x: fix partnum regression When adding GPIO support for the cp2105, the mentioned commit by Martyn Welch introduced a query for the part number of the chip. Unfortunately the driver aborts probing when this query fails, so currently the driver can not be used with chips not supporting this query. I have a data cable for Siemens mobile phones (ID 10ab:10c5) where this is the case. With this patch the driver can be bound even if the part number can not be queried. Fixes: cf5276ce7867 ("USB: serial: cp210x: Adding GPIO support for CP2105") Signed-off-by: Sebastian Frei [ johan: amend commit message; shorten error message and demote to warning; drop unnecessary move of usb_set_serial_data() ] Cc: stable # 4.9 Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 2d945c9f975c..8364b44f4c45 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -352,6 +352,7 @@ static struct usb_serial_driver * const serial_drivers[] = { #define CP210X_PARTNUM_CP2104 0x04 #define CP210X_PARTNUM_CP2105 0x05 #define CP210X_PARTNUM_CP2108 0x08 +#define CP210X_PARTNUM_UNKNOWN 0xFF /* CP210X_GET_COMM_STATUS returns these 0x13 bytes */ struct cp210x_comm_status { @@ -1491,8 +1492,11 @@ static int cp210x_attach(struct usb_serial *serial) result = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, CP210X_GET_PARTNUM, &priv->partnum, sizeof(priv->partnum)); - if (result < 0) - goto err_free_priv; + if (result < 0) { + dev_warn(&serial->interface->dev, + "querying part number failed\n"); + priv->partnum = CP210X_PARTNUM_UNKNOWN; + } usb_set_serial_data(serial, priv); @@ -1505,10 +1509,6 @@ static int cp210x_attach(struct usb_serial *serial) } return 0; -err_free_priv: - kfree(priv); - - return result; } static void cp210x_disconnect(struct usb_serial *serial) From 311de3ce96f453a5a6edb3066eb6109cc07e81d2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 6 Sep 2017 10:40:52 +0900 Subject: [PATCH 008/217] gpio: thunderx: select IRQ_DOMAIN_HIERARCHY instead of depends on IRQ_DOMAIN_HIERARCHY is not user-configurable, but supposed to be selected by drivers that need IRQ domain hierarchy support. GPIO_THUNDERX is the only user of "depends on IRQ_DOMAIN_HIERARCHY". This means, we can not enable GPIO_THUNDERX unless other drivers select IRQ_DOMAIN_HIERARCHY elsewhere. This is odd. Flip the logic. Signed-off-by: Masahiro Yamada Acked-by: David Daney Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3388d54ba114..3f80f167ed56 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -453,7 +453,8 @@ config GPIO_TS4800 config GPIO_THUNDERX tristate "Cavium ThunderX/OCTEON-TX GPIO" depends on ARCH_THUNDER || (64BIT && COMPILE_TEST) - depends on PCI_MSI && IRQ_DOMAIN_HIERARCHY + depends on PCI_MSI + select IRQ_DOMAIN_HIERARCHY select IRQ_FASTEOI_HIERARCHY_HANDLERS help Say yes here to support the on-chip GPIO lines on the ThunderX From e40a3ae1f794a35c4af3746291ed6fedc1fa0f6f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Sep 2017 17:47:45 +0200 Subject: [PATCH 009/217] gpio: acpi: work around false-positive -Wstring-overflow warning gcc-7 notices that the pin_table is an array of 16-bit numbers, but fails to take the following range check into account: drivers/gpio/gpiolib-acpi.c: In function 'acpi_gpiochip_request_interrupt': drivers/gpio/gpiolib-acpi.c:206:24: warning: '%02X' directive writing between 2 and 4 bytes into a region of size 3 [-Wformat-overflow=] sprintf(ev_name, "_%c%02X", ^~~~ drivers/gpio/gpiolib-acpi.c:206:20: note: directive argument in the range [0, 65535] sprintf(ev_name, "_%c%02X", ^~~~~~~~~ drivers/gpio/gpiolib-acpi.c:206:3: note: 'sprintf' output between 5 and 7 bytes into a destination of size 5 sprintf(ev_name, "_%c%02X", ^~~~~~~~~~~~~~~~~~~~~~~~~~~ agpio->triggering == ACPI_EDGE_SENSITIVE ? 'E' : 'L', ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pin); ~~~~ As suggested by Andy, this changes the format string to have a fixed length. Since modifying the range check did not help, I also opened a bug against gcc, see link below. Fixes: 0d1c28a449c6 ("gpiolib-acpi: Add ACPI5 event model support to gpio.") Cc: Andy Shevchenko Link: https://patchwork.kernel.org/patch/9840801/ Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82123 Signed-off-by: Arnd Bergmann Acked-by: Mika Westerberg Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 4d2113530735..eb4528c87c0b 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -203,7 +203,7 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares, if (pin <= 255) { char ev_name[5]; - sprintf(ev_name, "_%c%02X", + sprintf(ev_name, "_%c%02hhX", agpio->triggering == ACPI_EDGE_SENSITIVE ? 'E' : 'L', pin); if (ACPI_SUCCESS(acpi_get_handle(handle, ev_name, &evt_handle))) From c496ad835c31ad639b6865714270b3003df031f6 Mon Sep 17 00:00:00 2001 From: Andreas Engel Date: Mon, 18 Sep 2017 21:11:57 +0200 Subject: [PATCH 010/217] USB: serial: cp210x: add support for ELV TFD500 Add the USB device id for the ELV TFD500 data logger. Signed-off-by: Andreas Engel Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 8364b44f4c45..412f812522ee 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -177,6 +177,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */ { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */ { USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */ + { USB_DEVICE(0x18EF, 0xE032) }, /* ELV TFD500 Data Logger */ { USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder interface */ { USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */ { USB_DEVICE(0x1901, 0x0194) }, /* GE Healthcare Remote Alarm Box */ From e4b2ae7a8a11c5d4e0a6e21ba65d4b487a15d3d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 16 Sep 2017 22:42:21 +0200 Subject: [PATCH 011/217] gpio: omap: omap_gpio_show_rev is not __init The probe function calls omap_gpio_show_rev(), which on most compilers is inlined, but on the old gcc-4.6 is not, causing a valid warning about the incorrect __init annotation: WARNING: vmlinux.o(.text+0x40f614): Section mismatch in reference from the function omap_gpio_probe() to the function .init.text:omap_gpio_show_rev() The function omap_gpio_probe() references the function __init omap_gpio_show_rev(). This is often because omap_gpio_probe lacks a __init annotation or the annotation of omap_gpio_show_rev is wrong. This removes the __init. Signed-off-by: Arnd Bergmann Acked-by: Santosh Shilimkar Signed-off-by: Linus Walleij --- drivers/gpio/gpio-omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index dbf869fb63ce..22d7d4838265 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -1010,7 +1010,7 @@ static void omap_gpio_set(struct gpio_chip *chip, unsigned offset, int value) /*---------------------------------------------------------------------*/ -static void __init omap_gpio_show_rev(struct gpio_bank *bank) +static void omap_gpio_show_rev(struct gpio_bank *bank) { static bool called; u32 rev; From c84284e59d60514539cb06741972adf60b14a5a3 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 19 Sep 2017 21:04:56 +0200 Subject: [PATCH 012/217] pinctrl: bcm2835: fix build warning in bcm2835_gpio_irq_handle_bank This patch fix the following build warning: drivers/pinctrl/bcm/pinctrl-bcm2835.c:376:15: warning: variable 'type' set but not used [-Wunused-but-set-variable] Furthermore, it is unused for a long time, at least since commit 85ae9e512f43 ("pinctrl: bcm2835: switch to GPIOLIB_IRQCHIP") where a "FIXME no clue why the code looks up the type here" was added. A year after, nobody answeered this question, so its time to remove it. Signed-off-by: Corentin Labbe Acked-by: Stefan Wahren Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index 0944310225db..ff782445dfb7 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -373,16 +373,12 @@ static void bcm2835_gpio_irq_handle_bank(struct bcm2835_pinctrl *pc, unsigned long events; unsigned offset; unsigned gpio; - unsigned int type; events = bcm2835_gpio_rd(pc, GPEDS0 + bank * 4); events &= mask; events &= pc->enabled_irq_map[bank]; for_each_set_bit(offset, &events, 32) { gpio = (32 * bank) + offset; - /* FIXME: no clue why the code looks up the type here */ - type = pc->irq_type[gpio]; - generic_handle_irq(irq_linear_revmap(pc->gpio_chip.irqdomain, gpio)); } From 87a2f622cc6446c7d09ac655b7b9b04886f16a4c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 18 Sep 2017 11:16:26 +0300 Subject: [PATCH 013/217] dmaengine: edma: Align the memcpy acnt array size with the transfer Memory to Memory transfers does not have any special alignment needs regarding to acnt array size, but if one of the areas are in memory mapped regions (like PCIe memory), we need to make sure that the acnt array size is aligned with the mem copy parameters. Before "dmaengine: edma: Optimize memcpy operation" change the memcpy was set up in a different way: acnt == number of bytes in a word based on __ffs((src | dest | len), bcnt and ccnt for looping the necessary number of words to comlete the trasnfer. Instead of reverting the commit we can fix it to make sure that the ACNT size is aligned to the traswnfer. Fixes: df6694f80365a (dmaengine: edma: Optimize memcpy operation) Signed-off-by: Peter Ujfalusi Cc: stable@vger.kernel.org Signed-off-by: Vinod Koul --- drivers/dma/edma.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c index 3879f80a4815..a7ea20e7b8e9 100644 --- a/drivers/dma/edma.c +++ b/drivers/dma/edma.c @@ -1143,11 +1143,24 @@ static struct dma_async_tx_descriptor *edma_prep_dma_memcpy( struct edma_desc *edesc; struct device *dev = chan->device->dev; struct edma_chan *echan = to_edma_chan(chan); - unsigned int width, pset_len; + unsigned int width, pset_len, array_size; if (unlikely(!echan || !len)) return NULL; + /* Align the array size (acnt block) with the transfer properties */ + switch (__ffs((src | dest | len))) { + case 0: + array_size = SZ_32K - 1; + break; + case 1: + array_size = SZ_32K - 2; + break; + default: + array_size = SZ_32K - 4; + break; + } + if (len < SZ_64K) { /* * Transfer size less than 64K can be handled with one paRAM @@ -1169,7 +1182,7 @@ static struct dma_async_tx_descriptor *edma_prep_dma_memcpy( * When the full_length is multibple of 32767 one slot can be * used to complete the transfer. */ - width = SZ_32K - 1; + width = array_size; pset_len = rounddown(len, width); /* One slot is enough for lengths multiple of (SZ_32K -1) */ if (unlikely(pset_len == len)) @@ -1217,7 +1230,7 @@ static struct dma_async_tx_descriptor *edma_prep_dma_memcpy( } dest += pset_len; src += pset_len; - pset_len = width = len % (SZ_32K - 1); + pset_len = width = len % array_size; ret = edma_config_pset(chan, &edesc->pset[1], src, dest, 1, width, pset_len, DMA_MEM_TO_MEM); From 2ccb4837c938357233a0b8818e3ca3e58242c952 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Sep 2017 14:35:32 +0300 Subject: [PATCH 014/217] dmaengine: ti-dma-crossbar: Fix possible race condition with dma_inuse When looking for unused xbar_out lane we should also protect the set_bit() call with the same mutex to protect against concurrent threads picking the same ID. Fixes: ec9bfa1e1a796 ("dmaengine: ti-dma-crossbar: dra7: Use bitops instead of idr") Signed-off-by: Peter Ujfalusi Cc: stable@vger.kernel.org Signed-off-by: Vinod Koul --- drivers/dma/ti-dma-crossbar.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/ti-dma-crossbar.c b/drivers/dma/ti-dma-crossbar.c index 2f65a8fde21d..f1d04b70ee67 100644 --- a/drivers/dma/ti-dma-crossbar.c +++ b/drivers/dma/ti-dma-crossbar.c @@ -262,13 +262,14 @@ static void *ti_dra7_xbar_route_allocate(struct of_phandle_args *dma_spec, mutex_lock(&xbar->mutex); map->xbar_out = find_first_zero_bit(xbar->dma_inuse, xbar->dma_requests); - mutex_unlock(&xbar->mutex); if (map->xbar_out == xbar->dma_requests) { + mutex_unlock(&xbar->mutex); dev_err(&pdev->dev, "Run out of free DMA requests\n"); kfree(map); return ERR_PTR(-ENOMEM); } set_bit(map->xbar_out, xbar->dma_inuse); + mutex_unlock(&xbar->mutex); map->xbar_in = (u16)dma_spec->args[0]; From 40784d72aeeb4d95cf74ea2243223a85193f0e84 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Aug 2017 00:19:54 +0200 Subject: [PATCH 015/217] crypto: axis - hide an unused variable Without CONFIG_DEBUG_FS, we get a harmless warning: drivers/crypto/axis/artpec6_crypto.c:352:23: error: 'dbgfs_root' defined but not used [-Werror=unused-variable] This moves it into the #ifdef that hides the only user. Fixes: a21eb94fc4d3 ("crypto: axis - add ARTPEC-6/7 crypto accelerator driver") Signed-off-by: Arnd Bergmann Acked-by: Lars Persson Signed-off-by: Herbert Xu --- drivers/crypto/axis/artpec6_crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index d9fbbf01062b..0f9754e07719 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -349,8 +349,6 @@ struct artpec6_crypto_aead_req_ctx { /* The crypto framework makes it hard to avoid this global. */ static struct device *artpec6_crypto_dev; -static struct dentry *dbgfs_root; - #ifdef CONFIG_FAULT_INJECTION static DECLARE_FAULT_ATTR(artpec6_crypto_fail_status_read); static DECLARE_FAULT_ATTR(artpec6_crypto_fail_dma_array_full); @@ -2984,6 +2982,8 @@ struct dbgfs_u32 { char *desc; }; +static struct dentry *dbgfs_root; + static void artpec6_crypto_init_debugfs(void) { dbgfs_root = debugfs_create_dir("artpec6_crypto", NULL); From b621129f4f08c8d42ac4de2e77a07c5cf0c4b740 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 15 Feb 2017 16:33:56 +0300 Subject: [PATCH 016/217] netfilter: ipvs: full-functionality option for ECN encapsulation in tunnel IPVS tunnel mode works as simple tunnel (see RFC 3168) copying ECN field to outer header. That's result in packet drops on egress tunnels in case the egress tunnel operates as ECN-capable with Full-functionality option (like ip_tunnel and ip6_tunnel kernel modules), according to RFC 3168 section 9.1.1 recommendation. This patch implements ECN full-functionality option into ipvs xmit code. Cc: netdev@vger.kernel.org Cc: lvs-devel@vger.kernel.org Signed-off-by: Vadim Fedorenko Reviewed-by: Konstantin Khlebnikov Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 90d396814798..4527921b1c3a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -921,6 +921,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, { struct sk_buff *new_skb = NULL; struct iphdr *old_iph = NULL; + __u8 old_dsfield; #ifdef CONFIG_IP_VS_IPV6 struct ipv6hdr *old_ipv6h = NULL; #endif @@ -945,7 +946,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, *payload_len = ntohs(old_ipv6h->payload_len) + sizeof(*old_ipv6h); - *dsfield = ipv6_get_dsfield(old_ipv6h); + old_dsfield = ipv6_get_dsfield(old_ipv6h); *ttl = old_ipv6h->hop_limit; if (df) *df = 0; @@ -960,12 +961,15 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, /* fix old IP header checksum */ ip_send_check(old_iph); - *dsfield = ipv4_get_dsfield(old_iph); + old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) *payload_len = ntohs(old_iph->tot_len); } + /* Implement full-functionality option for ECN encapsulation */ + *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield); + return skb; error: kfree_skb(skb); From 89fcbb564f4a64c439d597c2702f990eed49c8a1 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 21 Sep 2017 19:01:36 -0600 Subject: [PATCH 017/217] netfilter: xt_socket: Restore mark from full sockets only An out of bounds error was detected on an ARM64 target with Android based kernel 4.9. This occurs while trying to restore mark on a skb from an inet request socket. BUG: KASAN: slab-out-of-bounds in socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248 Read of size 4 at addr ffffffc06a8d824c by task syz-fuzzer/1532 CPU: 7 PID: 1532 Comm: syz-fuzzer Tainted: G W O 4.9.41+ #1 Call trace: [] dump_backtrace+0x0/0x440 arch/arm64/kernel/traps.c:76 [] show_stack+0x28/0x38 arch/arm64/kernel/traps.c:226 [] __dump_stack lib/dump_stack.c:15 [inline] [] dump_stack+0xe4/0x134 lib/dump_stack.c:51 [] print_address_description+0x68/0x258 mm/kasan/report.c:248 [] kasan_report_error mm/kasan/report.c:347 [inline] [] kasan_report.part.2+0x228/0x2f0 mm/kasan/report.c:371 [] kasan_report+0x5c/0x70 mm/kasan/report.c:372 [] check_memory_region_inline mm/kasan/kasan.c:308 [inline] [] __asan_load4+0x88/0xa0 mm/kasan/kasan.c:740 [] socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248 [] socket_mt4_v1_v2_v3+0x3c/0x48 net/netfilter/xt_socket.c:272 [] ipt_do_table+0x54c/0xad8 net/ipv4/netfilter/ip_tables.c:311 [] iptable_mangle_hook+0x6c/0x220 net/ipv4/netfilter/iptable_mangle.c:90 ... Allocated by task 1532: save_stack_trace_tsk+0x0/0x2a0 arch/arm64/kernel/stacktrace.c:131 save_stack_trace+0x28/0x38 arch/arm64/kernel/stacktrace.c:215 save_stack mm/kasan/kasan.c:495 [inline] set_track mm/kasan/kasan.c:507 [inline] kasan_kmalloc+0xd8/0x188 mm/kasan/kasan.c:599 kasan_slab_alloc+0x14/0x20 mm/kasan/kasan.c:537 slab_post_alloc_hook mm/slab.h:417 [inline] slab_alloc_node mm/slub.c:2728 [inline] slab_alloc mm/slub.c:2736 [inline] kmem_cache_alloc+0x14c/0x2e8 mm/slub.c:2741 reqsk_alloc include/net/request_sock.h:87 [inline] inet_reqsk_alloc+0x4c/0x238 net/ipv4/tcp_input.c:6236 tcp_conn_request+0x2b0/0xea8 net/ipv4/tcp_input.c:6341 tcp_v4_conn_request+0xe0/0x100 net/ipv4/tcp_ipv4.c:1256 tcp_rcv_state_process+0x384/0x18a8 net/ipv4/tcp_input.c:5926 tcp_v4_do_rcv+0x2f0/0x3e0 net/ipv4/tcp_ipv4.c:1430 tcp_v4_rcv+0x1278/0x1350 net/ipv4/tcp_ipv4.c:1709 ip_local_deliver_finish+0x174/0x3e0 net/ipv4/ip_input.c:216 v1->v2: Change socket_mt6_v1_v2_v3() as well as mentioned by Eric v2->v3: Put the correct fixes tag Fixes: 01555e74bde5 ("netfilter: xt_socket: add XT_SOCKET_RESTORESKMARK flag") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index e75ef39669c5..575d2153e3b8 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) @@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) From 48596a8ddc46f96afb6a2cd72787cb15d6bb01fc Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 23 Sep 2017 23:37:40 +0200 Subject: [PATCH 018/217] netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses Wrong comparison prevented the hash types to add a range with more than 2^31 addresses but reported as a success. Fixes Netfilter's bugzilla id #1005, reported by Oleg Serditov and Oliver Ford. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ip.c | 22 +++++++++++--------- net/netfilter/ipset/ip_set_hash_ipmark.c | 2 +- net/netfilter/ipset/ip_set_hash_ipport.c | 2 +- net/netfilter/ipset/ip_set_hash_ipportip.c | 2 +- net/netfilter/ipset/ip_set_hash_ipportnet.c | 4 ++-- net/netfilter/ipset/ip_set_hash_net.c | 2 +- net/netfilter/ipset/ip_set_hash_netiface.c | 2 +- net/netfilter/ipset/ip_set_hash_netnet.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netport.c | 2 +- net/netfilter/ipset/ip_set_hash_netportnet.c | 4 ++-- 10 files changed, 24 insertions(+), 22 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 20bfbd315f61..613eb212cb48 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -123,13 +123,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; ip &= ip_set_hostmask(h->netmask); + e.ip = htonl(ip); + if (e.ip == 0) + return -IPSET_ERR_HASH_ELEM; - if (adt == IPSET_TEST) { - e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + if (adt == IPSET_TEST) return adtfn(set, &e, &ext, &ext, flags); - } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { @@ -148,17 +147,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); - if (retried) + if (retried) { ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip += hosts) { e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + } + for (; ip <= ip_to;) { ret = adtfn(set, &e, &ext, &ext, flags); - if (ret && !ip_set_eexist(ret, flags)) return ret; + ip += hosts; + e.ip = htonl(ip); + if (e.ip == 0) + return 0; + ret = 0; } return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index b64cf14e8352..f3ba8348cf9d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index f438740e6c6a..ddb8039ec1d2 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -178,7 +178,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 6215fb898c50..a7f4d7a85420 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -185,7 +185,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5ab1b99a53c2..a2f19b9906e9 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -271,7 +271,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; @@ -281,7 +281,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip == ntohl(h->next.ip) && p == ntohs(h->next.port) ? ntohl(h->next.ip2) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip2 = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &cidr); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 5d9e895452e7..1c67a1761e45 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -193,7 +193,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], } if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 44cf11939c91..d417074f1c1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -255,7 +255,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index db614e13b193..7f9ae2e9645b 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -250,13 +250,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 54b64b6cd0cd..e6ef382febe4 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -241,7 +241,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index aff846960ac4..8602f2595a1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -291,7 +291,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) @@ -301,7 +301,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip2 = (retried && ip == ntohl(h->next.ip[0]) && p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); From e23ed762db7ed1950a6408c3be80bc56909ab3d4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 11:57:54 +0200 Subject: [PATCH 019/217] netfilter: ipset: pernet ops must be unregistered last Removing the ipset module leaves a small window where one cpu performs module removal while another runs a command like 'ipset flush'. ipset uses net_generic(), unregistering the pernet ops frees this storage area. Fix it by first removing the user-visible api handlers and the pernet ops last. Fixes: 1785e8f473082 ("netfiler: ipset: Add net namespace for ipset") Reported-by: Li Shuang Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e495b5e484b1..a7f049ff3049 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2072,25 +2072,28 @@ static struct pernet_operations ip_set_net_ops = { static int __init ip_set_init(void) { - int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + int ret = register_pernet_subsys(&ip_set_net_ops); - if (ret != 0) { - pr_err("ip_set: cannot register with nfnetlink.\n"); + if (ret) { + pr_err("ip_set: cannot register pernet_subsys.\n"); return ret; } + + ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if (ret != 0) { + pr_err("ip_set: cannot register with nfnetlink.\n"); + unregister_pernet_subsys(&ip_set_net_ops); + return ret; + } + ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + unregister_pernet_subsys(&ip_set_net_ops); return ret; } - ret = register_pernet_subsys(&ip_set_net_ops); - if (ret) { - pr_err("ip_set: cannot register pernet_subsys.\n"); - nf_unregister_sockopt(&so_set); - nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - return ret; - } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -2098,9 +2101,10 @@ ip_set_init(void) static void __exit ip_set_fini(void) { - unregister_pernet_subsys(&ip_set_net_ops); nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + + unregister_pernet_subsys(&ip_set_net_ops); pr_debug("these are the famous last words\n"); } From 83b31c2a5fdd4fb3a4ec84c59a962e816d0bc9de Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 26 Sep 2017 15:51:28 +0200 Subject: [PATCH 020/217] pinctrl/amd: Fix build dependency on pinmux code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 79d2c8bede2c93f943 ("pinctrl/amd: save pin registers over suspend/resume") caused the following compilation errors: drivers/pinctrl/pinctrl-amd.c: In function ‘amd_gpio_should_save’: drivers/pinctrl/pinctrl-amd.c:741:8: error: ‘const struct pin_desc’ has no member named ‘mux_owner’ if (pd->mux_owner || pd->gpio_owner || ^ drivers/pinctrl/pinctrl-amd.c:741:25: error: ‘const struct pin_desc’ has no member named ‘gpio_owner’ if (pd->mux_owner || pd->gpio_owner || We need to enable CONFIG_PINMUX for this driver as well. Cc: stable@vger.kernel.org Fixes: 79d2c8bede2c93f943 ("pinctrl/amd: save pin registers over suspend/resume") Signed-off-by: Petr Mladek Signed-off-by: Linus Walleij --- drivers/pinctrl/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index 1778cf4f81c7..82cd8b08d71f 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -100,6 +100,7 @@ config PINCTRL_AMD tristate "AMD GPIO pin control" depends on GPIOLIB select GPIOLIB_IRQCHIP + select PINMUX select PINCONF select GENERIC_PINCONF help From dd269db84908d4d3f7c0efed85bf9d8939fb0b9b Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 27 Sep 2017 14:25:37 +0200 Subject: [PATCH 021/217] xfrm: don't call xfrm_policy_cache_flush under xfrm_state_lock I might be wrong but it doesn't look like xfrm_state_lock is required for xfrm_policy_cache_flush and calling it under this lock triggers both "sleeping function called from invalid context" and "possible circular locking dependency detected" warnings on flush. Fixes: ec30d78c14a8 xfrm: add xdst pcpu cache Signed-off-by: Artem Savkov Acked-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 0dab1cd79ce4..12213477cd3a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -732,12 +732,12 @@ int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) } } } +out: + spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (cnt) { err = 0; xfrm_policy_cache_flush(); } -out: - spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_flush); From d9ec46416de5ef83220f1c7010ee0f5d1be1d753 Mon Sep 17 00:00:00 2001 From: Sylvain Lesne Date: Mon, 18 Sep 2017 13:08:00 +0200 Subject: [PATCH 022/217] dmaengine: altera: fix response FIFO emptying Commit 6084fc2ec478 ("dmaengine: altera: Use macros instead of structs to describe the registers") introduced a minus sign before a register offset. This leads to soft-locks of the DMA controller, since reading the last status byte is required to pop the response from the FIFO. Failing to do so will lead to a full FIFO, which means that the DMA controller will stop processing descriptors. Signed-off-by: Sylvain Lesne Reviewed-by: Stefan Roese Signed-off-by: Vinod Koul --- drivers/dma/altera-msgdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/altera-msgdma.c b/drivers/dma/altera-msgdma.c index 32905d5606ac..35cbf2365f68 100644 --- a/drivers/dma/altera-msgdma.c +++ b/drivers/dma/altera-msgdma.c @@ -698,7 +698,7 @@ static void msgdma_tasklet(unsigned long data) * bits. So we need to just drop these values. */ size = ioread32(mdev->resp + MSGDMA_RESP_BYTES_TRANSFERRED); - status = ioread32(mdev->resp - MSGDMA_RESP_STATUS); + status = ioread32(mdev->resp + MSGDMA_RESP_STATUS); msgdma_complete_descriptor(mdev); msgdma_chan_desc_cleanup(mdev); From edf10919e5fc8dfd10e57ed72f651204559bc6ba Mon Sep 17 00:00:00 2001 From: Sylvain Lesne Date: Mon, 18 Sep 2017 13:08:01 +0200 Subject: [PATCH 023/217] dmaengine: altera: fix spinlock usage Since this lock is acquired in both process and IRQ context, failing to to disable IRQs when trying to acquire the lock in process context can lead to deadlocks. Signed-off-by: Sylvain Lesne Reviewed-by: Stefan Roese Signed-off-by: Vinod Koul --- drivers/dma/altera-msgdma.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/dma/altera-msgdma.c b/drivers/dma/altera-msgdma.c index 35cbf2365f68..339186f25a2a 100644 --- a/drivers/dma/altera-msgdma.c +++ b/drivers/dma/altera-msgdma.c @@ -212,11 +212,12 @@ struct msgdma_device { static struct msgdma_sw_desc *msgdma_get_descriptor(struct msgdma_device *mdev) { struct msgdma_sw_desc *desc; + unsigned long flags; - spin_lock_bh(&mdev->lock); + spin_lock_irqsave(&mdev->lock, flags); desc = list_first_entry(&mdev->free_list, struct msgdma_sw_desc, node); list_del(&desc->node); - spin_unlock_bh(&mdev->lock); + spin_unlock_irqrestore(&mdev->lock, flags); INIT_LIST_HEAD(&desc->tx_list); @@ -306,13 +307,14 @@ static dma_cookie_t msgdma_tx_submit(struct dma_async_tx_descriptor *tx) struct msgdma_device *mdev = to_mdev(tx->chan); struct msgdma_sw_desc *new; dma_cookie_t cookie; + unsigned long flags; new = tx_to_desc(tx); - spin_lock_bh(&mdev->lock); + spin_lock_irqsave(&mdev->lock, flags); cookie = dma_cookie_assign(tx); list_add_tail(&new->node, &mdev->pending_list); - spin_unlock_bh(&mdev->lock); + spin_unlock_irqrestore(&mdev->lock, flags); return cookie; } @@ -336,17 +338,18 @@ msgdma_prep_memcpy(struct dma_chan *dchan, dma_addr_t dma_dst, struct msgdma_extended_desc *desc; size_t copy; u32 desc_cnt; + unsigned long irqflags; desc_cnt = DIV_ROUND_UP(len, MSGDMA_MAX_TRANS_LEN); - spin_lock_bh(&mdev->lock); + spin_lock_irqsave(&mdev->lock, irqflags); if (desc_cnt > mdev->desc_free_cnt) { spin_unlock_bh(&mdev->lock); dev_dbg(mdev->dev, "mdev %p descs are not available\n", mdev); return NULL; } mdev->desc_free_cnt -= desc_cnt; - spin_unlock_bh(&mdev->lock); + spin_unlock_irqrestore(&mdev->lock, irqflags); do { /* Allocate and populate the descriptor */ @@ -397,18 +400,19 @@ msgdma_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl, u32 desc_cnt = 0, i; struct scatterlist *sg; u32 stride; + unsigned long irqflags; for_each_sg(sgl, sg, sg_len, i) desc_cnt += DIV_ROUND_UP(sg_dma_len(sg), MSGDMA_MAX_TRANS_LEN); - spin_lock_bh(&mdev->lock); + spin_lock_irqsave(&mdev->lock, irqflags); if (desc_cnt > mdev->desc_free_cnt) { spin_unlock_bh(&mdev->lock); dev_dbg(mdev->dev, "mdev %p descs are not available\n", mdev); return NULL; } mdev->desc_free_cnt -= desc_cnt; - spin_unlock_bh(&mdev->lock); + spin_unlock_irqrestore(&mdev->lock, irqflags); avail = sg_dma_len(sgl); @@ -566,10 +570,11 @@ static void msgdma_start_transfer(struct msgdma_device *mdev) static void msgdma_issue_pending(struct dma_chan *chan) { struct msgdma_device *mdev = to_mdev(chan); + unsigned long flags; - spin_lock_bh(&mdev->lock); + spin_lock_irqsave(&mdev->lock, flags); msgdma_start_transfer(mdev); - spin_unlock_bh(&mdev->lock); + spin_unlock_irqrestore(&mdev->lock, flags); } /** @@ -634,10 +639,11 @@ static void msgdma_free_descriptors(struct msgdma_device *mdev) static void msgdma_free_chan_resources(struct dma_chan *dchan) { struct msgdma_device *mdev = to_mdev(dchan); + unsigned long flags; - spin_lock_bh(&mdev->lock); + spin_lock_irqsave(&mdev->lock, flags); msgdma_free_descriptors(mdev); - spin_unlock_bh(&mdev->lock); + spin_unlock_irqrestore(&mdev->lock, flags); kfree(mdev->sw_desq); } @@ -682,8 +688,9 @@ static void msgdma_tasklet(unsigned long data) u32 count; u32 __maybe_unused size; u32 __maybe_unused status; + unsigned long flags; - spin_lock(&mdev->lock); + spin_lock_irqsave(&mdev->lock, flags); /* Read number of responses that are available */ count = ioread32(mdev->csr + MSGDMA_CSR_RESP_FILL_LEVEL); @@ -704,7 +711,7 @@ static void msgdma_tasklet(unsigned long data) msgdma_chan_desc_cleanup(mdev); } - spin_unlock(&mdev->lock); + spin_unlock_irqrestore(&mdev->lock, flags); } /** From e5173418ac597cebe9f7a39adf10be470000b518 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Wed, 27 Sep 2017 10:06:27 +0100 Subject: [PATCH 024/217] netfilter: ipset: Fix race between dump and swap Fix a race between ip_set_dump_start() and ip_set_swap(). The race is as follows: * Without holding the ref lock, ip_set_swap() checks ref_netlink of the set and it is 0. * ip_set_dump_start() takes a reference on the set. * ip_set_swap() does the swap (even though it now has a non-zero reference count). * ip_set_dump_start() gets the set from ip_set_list again which is now a different set since it has been swapped. * ip_set_dump_start() calls __ip_set_put_netlink() and hits a BUG_ON due to the reference count being 0. Fix this race by extending the critical region in which the ref lock is held to include checking the ref counts. The race can be reproduced with the following script: while :; do ipset destroy hash_ip1 ipset destroy hash_ip2 ipset create hash_ip1 hash:ip family inet hashsize 1024 \ maxelem 500000 ipset create hash_ip2 hash:ip family inet hashsize 300000 \ maxelem 500000 ipset create hash_ip3 hash:ip family inet hashsize 1024 \ maxelem 500000 ipset save & ipset swap hash_ip3 hash_ip2 ipset destroy hash_ip3 wait done Signed-off-by: Ross Lagerwall Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index a7f049ff3049..cf84f7b37cd9 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1191,14 +1191,17 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; - if (from->ref_netlink || to->ref_netlink) + write_lock_bh(&ip_set_ref_lock); + + if (from->ref_netlink || to->ref_netlink) { + write_unlock_bh(&ip_set_ref_lock); return -EBUSY; + } strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN); - write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); ip_set(inst, from_id) = to; ip_set(inst, to_id) = from; From 0d18779be13766b33c69cbc26df38383598da373 Mon Sep 17 00:00:00 2001 From: JingPiao Chen Date: Sat, 23 Sep 2017 17:10:44 +0800 Subject: [PATCH 025/217] netfilter: nf_tables: fix update chain error # nft add table filter # nft add chain filter c1 # nft rename chain filter c1 c2 Error: Could not process rule: No such file or directory rename chain filter c1 c2 ^^^^^^^^^^^^^^^^^^^^^^^^^^ # nft add chain filter c2 # nft rename chain filter c1 c2 # nft list table filter table ip filter { chain c2 { } chain c2 { } } Fixes: 664b0f8cd8 ("netfilter: nf_tables: add generation mask to chains") Signed-off-by: JingPiao Chen Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 929927171426..f98ca8c6aa59 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1487,8 +1487,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); - if (IS_ERR(chain2)) - return PTR_ERR(chain2); + if (!IS_ERR(chain2)) + return -EEXIST; } if (nla[NFTA_CHAIN_COUNTERS]) { From e6b72ee88a56bcfe63f72e9c30766484c45bec72 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Tue, 26 Sep 2017 18:35:45 +0200 Subject: [PATCH 026/217] netfilter: ebtables: fix race condition in frame_filter_net_init() It is possible for ebt_in_hook to be triggered before ebt_table is assigned resulting in a NULL-pointer dereference. Make sure hooks are registered as the last step. Fixes: aee12a0a3727 ("ebtables: remove nf_hook_register usage") Signed-off-by: Artem Savkov Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 7 ++++--- net/bridge/netfilter/ebtable_broute.c | 4 ++-- net/bridge/netfilter/ebtable_filter.c | 4 ++-- net/bridge/netfilter/ebtable_nat.c | 4 ++-- net/bridge/netfilter/ebtables.c | 17 +++++++++-------- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 2c2a5514b0df..528b24c78308 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -108,9 +108,10 @@ struct ebt_table { #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \ ~(__alignof__(struct _xt_align)-1)) -extern struct ebt_table *ebt_register_table(struct net *net, - const struct ebt_table *table, - const struct nf_hook_ops *); +extern int ebt_register_table(struct net *net, + const struct ebt_table *table, + const struct nf_hook_ops *ops, + struct ebt_table **res); extern void ebt_unregister_table(struct net *net, struct ebt_table *table, const struct nf_hook_ops *); extern unsigned int ebt_do_table(struct sk_buff *skb, diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 2585b100ebbb..276b60262981 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb) static int __net_init broute_net_init(struct net *net) { - net->xt.broute_table = ebt_register_table(net, &broute_table, NULL); - return PTR_ERR_OR_ZERO(net->xt.broute_table); + return ebt_register_table(net, &broute_table, NULL, + &net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 45a00dbdbcad..c41da5fac84f 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = { static int __net_init frame_filter_net_init(struct net *net) { - net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter); - return PTR_ERR_OR_ZERO(net->xt.frame_filter); + return ebt_register_table(net, &frame_filter, ebt_ops_filter, + &net->xt.frame_filter); } static void __net_exit frame_filter_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 57cd5bb154e7..08df7406ecb3 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = { static int __net_init frame_nat_net_init(struct net *net) { - net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat); - return PTR_ERR_OR_ZERO(net->xt.frame_nat); + return ebt_register_table(net, &frame_nat, ebt_ops_nat, + &net->xt.frame_nat); } static void __net_exit frame_nat_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 83951f978445..3b3dcf719e07 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) kfree(table); } -struct ebt_table * -ebt_register_table(struct net *net, const struct ebt_table *input_table, - const struct nf_hook_ops *ops) +int ebt_register_table(struct net *net, const struct ebt_table *input_table, + const struct nf_hook_ops *ops, struct ebt_table **res) { struct ebt_table_info *newinfo; struct ebt_table *t, *table; @@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, repl->entries == NULL || repl->entries_size == 0 || repl->counters != NULL || input_table->private != NULL) { BUGPRINT("Bad table data for ebt_register_table!!!\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } /* Don't add one table to multiple lists. */ @@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); mutex_unlock(&ebt_mutex); + WRITE_ONCE(*res, table); + if (!ops) - return table; + return 0; ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret) { __ebt_unregister_table(net, table); - return ERR_PTR(ret); + *res = NULL; } - return table; + return ret; free_unlock: mutex_unlock(&ebt_mutex); free_chainstack: @@ -1276,7 +1277,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, free_table: kfree(table); out: - return ERR_PTR(ret); + return ret; } void ebt_unregister_table(struct net *net, struct ebt_table *table, From 35c036ef4a722e953e17884f4f4325f78eeab475 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Sep 2017 12:42:13 -0400 Subject: [PATCH 027/217] nfs: RPC_MAX_AUTH_SIZE is in bytes The units of RPC_MAX_AUTH_SIZE is bytes, not 4-byte words. This causes the client to request a larger-than-necessary session replay slot size. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 37c8af003275..14ed9791ec9c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1842,8 +1842,8 @@ static void encode_create_session(struct xdr_stream *xdr, * Assumes OPEN is the biggest non-idempotent compound. * 2 is the verifier. */ - max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + - RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2) + * XDR_UNIT + RPC_MAX_AUTH_SIZE; encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); From cdb2e53fd6dc715c5b45d0967fcb6dc574cb28f8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Sep 2017 00:53:46 +0300 Subject: [PATCH 028/217] NFS: Cleanup error handling in nfs_idmap_request_key() nfs_idmap_get_desc() can't actually return zero. But if it did then we would return ERR_PTR(0) which is NULL and the caller, nfs_idmap_get_key(), doesn't expect that so it leads to a NULL pointer dereference. I've cleaned this up by changing the "<=" to "<" so it's more clear that we don't return ERR_PTR(0). Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/nfs4idmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index dd5d27da8c0c..30426c1a1bbd 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -274,7 +274,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); - if (ret <= 0) + if (ret < 0) return ERR_PTR(ret); rkey = request_key(&key_type_id_resolver, desc, ""); From 68ebf8fe3bce8c167cf83fbd681c1eb1ed419c6c Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 22 Sep 2017 07:57:10 -0400 Subject: [PATCH 029/217] NFS: Fix uninitialized rpc_wait_queue Michael Sterrett reports a NULL pointer dereference on NFSv3 mounts when CONFIG_NFS_V4 is not set because the NFS UOC rpc_wait_queue has not been initialized. Move the initialization of the queue out of the CONFIG_NFS_V4 conditional setion. Fixes: 7d6ddf88c4db ("NFS: Add an iocounter wait function for async RPC tasks") Cc: stable@vger.kernel.org # 4.11+ Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index efebe6cf4378..22880ef6d8dd 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,7 +218,6 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); - rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else @@ -888,6 +887,7 @@ struct nfs_server *nfs_alloc_server(void) ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } From d099b8af46f5e1e37182eff988f9373dcc2b0128 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 12:21:14 +0100 Subject: [PATCH 030/217] sunrpc: remove redundant initialization of sock sock is being initialized and then being almost immediately updated hence the initialized value is not being used and is redundant. Remove the initialization. Cleans up clang warning: warning: Value stored to 'sock' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9b5de31aa429..c1841f234a71 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2203,7 +2203,7 @@ static void xs_udp_setup_socket(struct work_struct *work) struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; - struct socket *sock = transport->sock; + struct socket *sock; int status = -EIO; sock = xs_create_sock(xprt, transport, From 0a47df11bfc31e1ceae7f91cea84d3bff500475d Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 29 Sep 2017 09:36:43 -0400 Subject: [PATCH 031/217] nfs/filelayout: fix oops when freeing filelayout segment Check for a NULL dsaddr in filelayout_free_lseg() before calling nfs4_fl_put_deviceid(). This fixes the following oops: [ 1967.645207] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 [ 1967.646010] IP: [] nfs4_put_deviceid_node+0xa/0x90 [nfsv4] [ 1967.646010] PGD c08bc067 PUD 915d3067 PMD 0 [ 1967.753036] Oops: 0000 [#1] SMP [ 1967.753036] Modules linked in: nfs_layout_nfsv41_files ext4 mbcache jbd2 loop rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache amd64_edac_mod ipmi_ssif edac_mce_amd edac_core kvm_amd sg kvm ipmi_si ipmi_devintf irqbypass pcspkr k8temp ipmi_msghandler i2c_piix4 shpchp nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic crct10dif_common amdkfd amd_iommu_v2 radeon i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops mptsas ttm scsi_transport_sas mptscsih drm mptbase serio_raw i2c_core bnx2 dm_mirror dm_region_hash dm_log dm_mod [ 1967.790031] CPU: 2 PID: 1370 Comm: ls Not tainted 3.10.0-709.el7.test.bz1463784.x86_64 #1 [ 1967.790031] Hardware name: IBM BladeCenter LS21 -[7971AC1]-/Server Blade, BIOS -[BAE155AUS-1.10]- 06/03/2009 [ 1967.790031] task: ffff8800c42a3f40 ti: ffff8800c4064000 task.ti: ffff8800c4064000 [ 1967.790031] RIP: 0010:[] [] nfs4_put_deviceid_node+0xa/0x90 [nfsv4] [ 1967.790031] RSP: 0000:ffff8800c4067978 EFLAGS: 00010246 [ 1967.790031] RAX: ffffffffc062f000 RBX: ffff8801d468a540 RCX: dead000000000200 [ 1967.790031] RDX: ffff8800c40679f8 RSI: ffff8800c4067a0c RDI: 0000000000000000 [ 1967.790031] RBP: ffff8800c4067980 R08: ffff8801d468a540 R09: 0000000000000000 [ 1967.790031] R10: 0000000000000000 R11: ffffffffffffffff R12: ffff8801d468a540 [ 1967.790031] R13: ffff8800c40679f8 R14: ffff8801d5645300 R15: ffff880126f15ff0 [ 1967.790031] FS: 00007f11053c9800(0000) GS:ffff88012bd00000(0000) knlGS:0000000000000000 [ 1967.790031] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1967.790031] CR2: 0000000000000030 CR3: 0000000094b55000 CR4: 00000000000007e0 [ 1967.790031] Stack: [ 1967.790031] ffff8801d468a540 ffff8800c4067990 ffffffffc062d2fe ffff8800c40679b0 [ 1967.790031] ffffffffc062b5b4 ffff8800c40679f8 ffff8801d468a540 ffff8800c40679d8 [ 1967.790031] ffffffffc06d39af ffff8800c40679f8 ffff880126f16078 0000000000000001 [ 1967.790031] Call Trace: [ 1967.790031] [] nfs4_fl_put_deviceid+0xe/0x10 [nfs_layout_nfsv41_files] [ 1967.790031] [] filelayout_free_lseg+0x24/0x90 [nfs_layout_nfsv41_files] [ 1967.790031] [] pnfs_free_lseg_list+0x5f/0x80 [nfsv4] [ 1967.790031] [] _pnfs_return_layout+0x157/0x270 [nfsv4] [ 1967.790031] [] nfs4_evict_inode+0x4d/0x70 [nfsv4] [ 1967.790031] [] evict+0xa9/0x180 [ 1967.790031] [] iput+0xf9/0x190 [ 1967.790031] [] nfs_dentry_iput+0x3a/0x50 [nfs] [ 1967.790031] [] shrink_dentry_list+0x20f/0x490 [ 1967.790031] [] d_invalidate+0xd8/0x150 [ 1967.790031] [] nfs_readdir_page_filler+0x40b/0x600 [nfs] [ 1967.790031] [] nfs_readdir_xdr_to_array+0x20d/0x3b0 [nfs] [ 1967.790031] [] ? __mem_cgroup_commit_charge+0xe2/0x2f0 [ 1967.790031] [] ? __add_to_page_cache_locked+0x48/0x170 [ 1967.790031] [] ? nfs_readdir_xdr_to_array+0x3b0/0x3b0 [nfs] [ 1967.790031] [] nfs_readdir_filler+0x22/0x90 [nfs] [ 1967.790031] [] do_read_cache_page+0x7f/0x190 [ 1967.790031] [] ? fillonedir+0xe0/0xe0 [ 1967.790031] [] read_cache_page+0x1c/0x30 [ 1967.790031] [] nfs_readdir+0x1ab/0x6b0 [nfs] [ 1967.790031] [] ? nfs4_xdr_dec_layoutget+0x270/0x270 [nfsv4] [ 1967.790031] [] ? fillonedir+0xe0/0xe0 [ 1967.790031] [] vfs_readdir+0xb0/0xe0 [ 1967.790031] [] SyS_getdents+0x95/0x120 [ 1967.790031] [] system_call_fastpath+0x16/0x1b [ 1967.790031] Code: 90 31 d2 48 89 d0 5d c3 85 f6 74 f5 8d 4e 01 89 f0 f0 0f b1 0f 39 f0 74 e2 89 c6 eb eb 0f 1f 40 00 66 66 66 66 90 55 48 89 e5 53 <48> 8b 47 30 48 89 fb a8 04 74 3b 8b 57 60 83 fa 02 74 19 8d 4a [ 1967.790031] RIP [] nfs4_put_deviceid_node+0xa/0x90 [nfsv4] [ 1967.790031] RSP [ 1967.790031] CR2: 0000000000000030 Signed-off-by: Scott Mayhew Fixes: 1ebf98012792 ("NFS/filelayout: Fix racy setting of fl->dsaddr...") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 44c638b7876c..508126eb49f9 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -745,7 +745,8 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); - nfs4_fl_put_deviceid(fl->dsaddr); + if (fl->dsaddr != NULL) + nfs4_fl_put_deviceid(fl->dsaddr); /* This assumes a single RW lseg */ if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_filelayout *flo; From e63aaaa6be54c956b9603590ea436b003407bb3e Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 20 Sep 2017 12:31:28 +0530 Subject: [PATCH 032/217] netfilter: nf_tables: Release memory obtained by kasprintf Free memory region, if nf_tables_set_alloc_name is not successful. Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars") Signed-off-by: Arvind Yadav Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f98ca8c6aa59..34adedcb239e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2741,8 +2741,10 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; - if (!strcmp(set->name, i->name)) + if (!strcmp(set->name, i->name)) { + kfree(set->name); return -ENFILE; + } } return 0; } From f5d9644c5fca7d8e8972268598bb516a7eae17f9 Mon Sep 17 00:00:00 2001 From: Shrirang Bagul Date: Fri, 29 Sep 2017 12:39:51 +0800 Subject: [PATCH 033/217] USB: serial: qcserial: add Dell DW5818, DW5819 Dell Wireless 5819/5818 devices are re-branded Sierra Wireless MC74 series which will by default boot with vid 0x413c and pid's 0x81cf, 0x81d0, 0x81d1, 0x81d2. Signed-off-by: Shrirang Bagul Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/qcserial.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index ebc0beea69d6..eb9928963a53 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -174,6 +174,10 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x413c, 0x81b3)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ {DEVICE_SWI(0x413c, 0x81b5)}, /* Dell Wireless 5811e QDL */ {DEVICE_SWI(0x413c, 0x81b6)}, /* Dell Wireless 5811e QDL */ + {DEVICE_SWI(0x413c, 0x81cf)}, /* Dell Wireless 5819 */ + {DEVICE_SWI(0x413c, 0x81d0)}, /* Dell Wireless 5819 */ + {DEVICE_SWI(0x413c, 0x81d1)}, /* Dell Wireless 5818 */ + {DEVICE_SWI(0x413c, 0x81d2)}, /* Dell Wireless 5818 */ /* Huawei devices */ {DEVICE_HWI(0x03f0, 0x581d)}, /* HP lt4112 LTE/HSPA+ Gobi 4G Modem (Huawei me906e) */ From 638164a2718f337ea224b747cf5977ef143166a4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 2 Oct 2017 02:50:16 +0800 Subject: [PATCH 034/217] f2fs: fix potential panic during fstrim As Ju Hyung Park reported: "When 'fstrim' is called for manual trim, a BUG() can be triggered randomly with this patch. I'm seeing this issue on both x86 Desktop and arm64 Android phone. On x86 Desktop, this was caused during Ubuntu boot-up. I have a cronjob installed which calls 'fstrim -v /' during boot. On arm64 Android, this was caused during GC looping with 1ms gc_min_sleep_time & gc_max_sleep_time." Root cause of this issue is that f2fs_wait_discard_bios can only be used by f2fs_put_super, because during put_super there must be no other referrers, so it can ignore discard entry's reference count when removing the entry, otherwise in other caller we will hit bug_on in __remove_discard_cmd as there may be other issuer added reference count in discard entry. Thread A Thread B - issue_discard_thread - f2fs_ioc_fitrim - f2fs_trim_fs - f2fs_wait_discard_bios - __issue_discard_cmd - __submit_discard_cmd - __wait_discard_cmd - dc->ref++ - __wait_one_discard_bio - __wait_discard_cmd - __remove_discard_cmd - f2fs_bug_on(sbi, dc->ref) Fixes: 969d1b180d987c2be02de890d0fff0f66a0e80de Reported-by: Ju Hyung Park Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a7c90386947..4b4a72f392be 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2525,7 +2525,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 621b9b3d320b..c695ff462ee6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1210,11 +1210,11 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, !umount); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -2244,7 +2244,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, false); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 89f61eb3d167..933c3d529e65 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, true); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { From ee213fc72fd67d0988525af501534f4cb924d1e9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Oct 2017 08:51:43 -0500 Subject: [PATCH 035/217] kprobes/x86: Set up frame pointer in kprobe trampoline Richard Weinberger saw an unwinder warning when running bcc's opensnoop: WARNING: kernel stack frame pointer at ffff99ef4076bea0 in opensnoop:2008 has bad value 0000000000000008 unwind stack type:0 next_sp: (null) mask:0x2 graph_idx:0 ... ffff99ef4076be88: ffff99ef4076bea0 (0xffff99ef4076bea0) ffff99ef4076be90: ffffffffac442721 (optimized_callback +0x81/0x90) ... A lockdep stack trace was initiated from inside a kprobe handler, when the unwinder noticed a bad frame pointer on the stack. The bad frame pointer is related to the fact that the kprobe optprobe trampoline doesn't save the frame pointer before calling into optimized_callback(). Reported-and-tested-by: Richard Weinberger Signed-off-by: Josh Poimboeuf Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7aef2f8ecd75c2f505ef9b80490412262cf4a44c.1507038547.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/common.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index db2182d63ed0..3fc0f9a794cb 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -3,6 +3,15 @@ /* Kprobes and Optprobes common header */ +#include + +#ifdef CONFIG_FRAME_POINTER +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ + " mov %" _ASM_SP ", %" _ASM_BP "\n" +#else +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" +#endif + #ifdef CONFIG_X86_64 #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax. */ \ @@ -17,7 +26,7 @@ " pushq %r10\n" \ " pushq %r11\n" \ " pushq %rbx\n" \ - " pushq %rbp\n" \ + SAVE_RBP_STRING \ " pushq %r12\n" \ " pushq %r13\n" \ " pushq %r14\n" \ @@ -48,7 +57,7 @@ " pushl %es\n" \ " pushl %ds\n" \ " pushl %eax\n" \ - " pushl %ebp\n" \ + SAVE_RBP_STRING \ " pushl %edi\n" \ " pushl %esi\n" \ " pushl %edx\n" \ From b664d57f39d01e775204d4f1a7e2f8bda77bc549 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 3 Oct 2017 16:18:02 +0900 Subject: [PATCH 036/217] kprobes/x86: Remove IRQ disabling from jprobe handlers Jprobes actually don't need to disable IRQs while calling handlers, because of how we specify the kernel interface in Documentation/kprobes.txt: ----- Probe handlers are run with preemption disabled. Depending on the architecture and optimization state, handlers may also run with interrupts disabled (e.g., kretprobe handlers and optimized kprobe handlers run without interrupt disabled on x86/x86-64). ----- So let's remove IRQ disabling from jprobes too. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150701508194.32266.14458959863314097305.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f0153714ddac..0742491cbb73 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) * raw stack chunk with redzones: */ __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->flags &= ~X86_EFLAGS_IF; - trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); /* From 3dd40cb320fee7c23b574ab821ce140ccd1281c9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Oct 2017 20:10:36 -0500 Subject: [PATCH 037/217] objtool: Upgrade libelf-devel warning to error for CONFIG_ORC_UNWINDER With CONFIG_ORC_UNWINDER, if the user doesn't have libelf-devel installed, and they don't see the make warning, their ORC unwinder will be silently broken. Upgrade the warning to an error. Reported-and-tested-by: Borislav Petkov Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d9dfc39fb8240998820f9efb233d283a1ee96084.1507079417.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cf007a31d575..bc5c79e8e3cf 100644 --- a/Makefile +++ b/Makefile @@ -933,7 +933,11 @@ ifdef CONFIG_STACK_VALIDATION ifeq ($(has_libelf),1) objtool_target := tools/objtool FORCE else - $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + ifdef CONFIG_ORC_UNWINDER + $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + else + $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + endif SKIP_STACK_VALIDATION := 1 export SKIP_STACK_VALIDATION endif From b42dc0635bf0a6aa59fe4d7c826796ff659908c7 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Tue, 26 Sep 2017 09:18:27 +0300 Subject: [PATCH 038/217] mei: always use domain runtime pm callbacks. This patch fixes a regression caused by the new changes in the "run wake" handlers. The mei devices that support D0i3 are no longer receiving an interrupt after entering runtime suspend state and will stall. pci_dev_run_wake function now returns "true" for some devices (including mei) for which it used to return "false", arguably incorrectly as "run wake" used to mean that wakeup signals can be generated for a device in the working state of the system, so it could not be enabled or disabled before too. MEI maps runtime suspend/resume to its own defined power gating (PG) states, (D0i3 or other depending on generation), hence we need to go around the native PCI runtime service which eventually brings the device into D3cold/hot state, but the mei devices cannot wake up from D3 unlike from D0i3/PG state, which keeps irq running. To get around PCI device native runtime pm, MEI uses runtime pm domain handlers which take precedence. Cc: #4.13+ Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Acked-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/pci-me.c | 21 +++++++++++---------- drivers/misc/mei/pci-txe.c | 30 +++++++++++------------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 4ff40d319676..630757a4b36a 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -226,12 +226,15 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pdev->dev_flags |= PCI_DEV_FLAGS_NEEDS_RESUME; /* - * For not wake-able HW runtime pm framework - * can't be used on pci device level. - * Use domain runtime pm callbacks instead. - */ - if (!pci_dev_run_wake(pdev)) - mei_me_set_pm_domain(dev); + * ME maps runtime suspend/resume to D0i states, + * hence we need to go around native PCI runtime service which + * eventually brings the device into D3cold/hot state, + * but the mei device cannot wake up from D3 unlike from D0i3. + * To get around the PCI device native runtime pm, + * ME uses runtime pm domain handlers which take precedence + * over the driver's pm handlers. + */ + mei_me_set_pm_domain(dev); if (mei_pg_is_enabled(dev)) pm_runtime_put_noidle(&pdev->dev); @@ -271,8 +274,7 @@ static void mei_me_shutdown(struct pci_dev *pdev) dev_dbg(&pdev->dev, "shutdown\n"); mei_stop(dev); - if (!pci_dev_run_wake(pdev)) - mei_me_unset_pm_domain(dev); + mei_me_unset_pm_domain(dev); mei_disable_interrupts(dev); free_irq(pdev->irq, dev); @@ -300,8 +302,7 @@ static void mei_me_remove(struct pci_dev *pdev) dev_dbg(&pdev->dev, "stop\n"); mei_stop(dev); - if (!pci_dev_run_wake(pdev)) - mei_me_unset_pm_domain(dev); + mei_me_unset_pm_domain(dev); mei_disable_interrupts(dev); diff --git a/drivers/misc/mei/pci-txe.c b/drivers/misc/mei/pci-txe.c index e38a5f144373..0566f9bfa7de 100644 --- a/drivers/misc/mei/pci-txe.c +++ b/drivers/misc/mei/pci-txe.c @@ -144,12 +144,14 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pdev->dev_flags |= PCI_DEV_FLAGS_NEEDS_RESUME; /* - * For not wake-able HW runtime pm framework - * can't be used on pci device level. - * Use domain runtime pm callbacks instead. - */ - if (!pci_dev_run_wake(pdev)) - mei_txe_set_pm_domain(dev); + * TXE maps runtime suspend/resume to own power gating states, + * hence we need to go around native PCI runtime service which + * eventually brings the device into D3cold/hot state. + * But the TXE device cannot wake up from D3 unlike from own + * power gating. To get around PCI device native runtime pm, + * TXE uses runtime pm domain handlers which take precedence. + */ + mei_txe_set_pm_domain(dev); pm_runtime_put_noidle(&pdev->dev); @@ -186,8 +188,7 @@ static void mei_txe_shutdown(struct pci_dev *pdev) dev_dbg(&pdev->dev, "shutdown\n"); mei_stop(dev); - if (!pci_dev_run_wake(pdev)) - mei_txe_unset_pm_domain(dev); + mei_txe_unset_pm_domain(dev); mei_disable_interrupts(dev); free_irq(pdev->irq, dev); @@ -215,8 +216,7 @@ static void mei_txe_remove(struct pci_dev *pdev) mei_stop(dev); - if (!pci_dev_run_wake(pdev)) - mei_txe_unset_pm_domain(dev); + mei_txe_unset_pm_domain(dev); mei_disable_interrupts(dev); free_irq(pdev->irq, dev); @@ -318,15 +318,7 @@ static int mei_txe_pm_runtime_suspend(struct device *device) else ret = -EAGAIN; - /* - * If everything is okay we're about to enter PCI low - * power state (D3) therefor we need to disable the - * interrupts towards host. - * However if device is not wakeable we do not enter - * D-low state and we need to keep the interrupt kicking - */ - if (!ret && pci_dev_run_wake(pdev)) - mei_disable_interrupts(dev); + /* keep irq on we are staying in D0 */ dev_dbg(&pdev->dev, "rpm: txe: runtime suspend ret=%d\n", ret); From 688cb67839e852740d22cf763e5eafb27d5a6e53 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Sun, 24 Sep 2017 11:35:34 +0300 Subject: [PATCH 039/217] mei: me: add gemini lake devices id Add Gemini Lake (GLK) device id. Signed-off-by: Tomas Winkler Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-me-regs.h | 2 ++ drivers/misc/mei/pci-me.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index c8307e8b4c16..0ccccbaf530d 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -127,6 +127,8 @@ #define MEI_DEV_ID_BXT_M 0x1A9A /* Broxton M */ #define MEI_DEV_ID_APL_I 0x5A9A /* Apollo Lake I */ +#define MEI_DEV_ID_GLK 0x319A /* Gemini Lake */ + #define MEI_DEV_ID_KBP 0xA2BA /* Kaby Point */ #define MEI_DEV_ID_KBP_2 0xA2BB /* Kaby Point 2 */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 630757a4b36a..78b3172c8e6e 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -93,6 +93,8 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_BXT_M, MEI_ME_PCH8_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_APL_I, MEI_ME_PCH8_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_GLK, MEI_ME_PCH8_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_KBP, MEI_ME_PCH8_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_KBP_2, MEI_ME_PCH8_CFG)}, From 192b2d78722ffea188e5ec6ae5d55010dce05a4b Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 29 Sep 2017 21:09:36 -0700 Subject: [PATCH 040/217] Drivers: hv: vmbus: Fix bugs in rescind handling This patch addresses the following bugs in the current rescind handling code: 1. Fixes a race condition where we may be invoking hv_process_channel_removal() on an already freed channel. 2. Prevents indefinite wait when rescinding sub-channels by correctly setting the probe_complete state. I would like to thank Dexuan for patiently reviewing earlier versions of this patch and identifying many of the issues fixed here. Greg, please apply this to 4.14-final. Fixes: '54a66265d675 ("Drivers: hv: vmbus: Fix rescind handling")' Signed-off-by: K. Y. Srinivasan Reviewed-by: Dexuan Cui Cc: stable@vger.kernel.org # (4.13 and above) Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 6 +++--- drivers/hv/channel_mgmt.c | 37 ++++++++++++++++++------------------- drivers/hv/vmbus_drv.c | 3 +-- include/linux/hyperv.h | 2 +- 4 files changed, 23 insertions(+), 25 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index efd5db743319..894b67ac2cae 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -640,6 +640,7 @@ void vmbus_close(struct vmbus_channel *channel) */ return; } + mutex_lock(&vmbus_connection.channel_mutex); /* * Close all the sub-channels first and then close the * primary channel. @@ -648,16 +649,15 @@ void vmbus_close(struct vmbus_channel *channel) cur_channel = list_entry(cur, struct vmbus_channel, sc_list); vmbus_close_internal(cur_channel); if (cur_channel->rescind) { - mutex_lock(&vmbus_connection.channel_mutex); - hv_process_channel_removal(cur_channel, + hv_process_channel_removal( cur_channel->offermsg.child_relid); - mutex_unlock(&vmbus_connection.channel_mutex); } } /* * Now close the primary. */ vmbus_close_internal(channel); + mutex_unlock(&vmbus_connection.channel_mutex); } EXPORT_SYMBOL_GPL(vmbus_close); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index bcbb031f7263..018d2e0f8ec5 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -159,7 +159,7 @@ static void vmbus_rescind_cleanup(struct vmbus_channel *channel) spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); - + channel->rescind = true; list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) { @@ -381,14 +381,21 @@ static void vmbus_release_relid(u32 relid) true); } -void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) +void hv_process_channel_removal(u32 relid) { unsigned long flags; - struct vmbus_channel *primary_channel; + struct vmbus_channel *primary_channel, *channel; - BUG_ON(!channel->rescind); BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + /* + * Make sure channel is valid as we may have raced. + */ + channel = relid2channel(relid); + if (!channel) + return; + + BUG_ON(!channel->rescind); if (channel->target_cpu != get_cpu()) { put_cpu(); smp_call_function_single(channel->target_cpu, @@ -515,6 +522,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) if (!fnew) { if (channel->sc_creation_callback != NULL) channel->sc_creation_callback(newchannel); + newchannel->probe_done = true; return; } @@ -834,7 +842,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) { struct vmbus_channel_rescind_offer *rescind; struct vmbus_channel *channel; - unsigned long flags; struct device *dev; rescind = (struct vmbus_channel_rescind_offer *)hdr; @@ -873,16 +880,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) return; } - spin_lock_irqsave(&channel->lock, flags); - channel->rescind = true; - spin_unlock_irqrestore(&channel->lock, flags); - - /* - * Now that we have posted the rescind state, perform - * rescind related cleanup. - */ - vmbus_rescind_cleanup(channel); - /* * Now wait for offer handling to complete. */ @@ -901,6 +898,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) if (channel->device_obj) { if (channel->chn_rescind_callback) { channel->chn_rescind_callback(channel); + vmbus_rescind_cleanup(channel); return; } /* @@ -909,6 +907,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) */ dev = get_device(&channel->device_obj->device); if (dev) { + vmbus_rescind_cleanup(channel); vmbus_device_unregister(channel->device_obj); put_device(dev); } @@ -921,16 +920,16 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * 1. Close all sub-channels first * 2. Then close the primary channel. */ + mutex_lock(&vmbus_connection.channel_mutex); + vmbus_rescind_cleanup(channel); if (channel->state == CHANNEL_OPEN_STATE) { /* * The channel is currently not open; * it is safe for us to cleanup the channel. */ - mutex_lock(&vmbus_connection.channel_mutex); - hv_process_channel_removal(channel, - channel->offermsg.child_relid); - mutex_unlock(&vmbus_connection.channel_mutex); + hv_process_channel_removal(rescind->child_relid); } + mutex_unlock(&vmbus_connection.channel_mutex); } } diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a9d49f6f6501..937801ac2fe0 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -768,8 +768,7 @@ static void vmbus_device_release(struct device *device) struct vmbus_channel *channel = hv_dev->channel; mutex_lock(&vmbus_connection.channel_mutex); - hv_process_channel_removal(channel, - channel->offermsg.child_relid); + hv_process_channel_removal(channel->offermsg.child_relid); mutex_unlock(&vmbus_connection.channel_mutex); kfree(hv_dev); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index c458d7b7ad19..6431087816ba 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1403,7 +1403,7 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, const int *srv_version, int srv_vercnt, int *nego_fw_version, int *nego_srv_version); -void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid); +void hv_process_channel_removal(u32 relid); void vmbus_setevent(struct vmbus_channel *channel); /* From 512cf465ee01eb23936a9e6ed0b6414eccb00853 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 29 Sep 2017 15:39:49 -0700 Subject: [PATCH 041/217] binder: fix use-after-free in binder_transaction() User-space normally keeps the node alive when creating a transaction since it has a reference to the target. The local strong ref keeps it alive if the sending process dies before the target process processes the transaction. If the source process is malicious or has a reference counting bug, this can fail. In this case, when we attempt to decrement the node in the failure path, the node has already been freed. This is fixed by taking a tmpref on the node while constructing the transaction. To avoid re-acquiring the node lock and inner proc lock to increment the proc's tmpref, a helper is used that does the ref increments on both the node and proc. Signed-off-by: Todd Kjos Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 95 ++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ab34239a76ee..0621a95b8597 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2582,6 +2582,48 @@ static bool binder_proc_transaction(struct binder_transaction *t, return true; } +/** + * binder_get_node_refs_for_txn() - Get required refs on node for txn + * @node: struct binder_node for which to get refs + * @proc: returns @node->proc if valid + * @error: if no @proc then returns BR_DEAD_REPLY + * + * User-space normally keeps the node alive when creating a transaction + * since it has a reference to the target. The local strong ref keeps it + * alive if the sending process dies before the target process processes + * the transaction. If the source process is malicious or has a reference + * counting bug, relying on the local strong ref can fail. + * + * Since user-space can cause the local strong ref to go away, we also take + * a tmpref on the node to ensure it survives while we are constructing + * the transaction. We also need a tmpref on the proc while we are + * constructing the transaction, so we take that here as well. + * + * Return: The target_node with refs taken or NULL if no @node->proc is NULL. + * Also sets @proc if valid. If the @node->proc is NULL indicating that the + * target proc has died, @error is set to BR_DEAD_REPLY + */ +static struct binder_node *binder_get_node_refs_for_txn( + struct binder_node *node, + struct binder_proc **procp, + uint32_t *error) +{ + struct binder_node *target_node = NULL; + + binder_node_inner_lock(node); + if (node->proc) { + target_node = node; + binder_inc_node_nilocked(node, 1, 0, NULL); + binder_inc_node_tmpref_ilocked(node); + node->proc->tmp_ref++; + *procp = node->proc; + } else + *error = BR_DEAD_REPLY; + binder_node_inner_unlock(node); + + return target_node; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2685,43 +2727,35 @@ static void binder_transaction(struct binder_proc *proc, ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { - binder_inc_node(ref->node, 1, 0, NULL); - target_node = ref->node; + target_node = binder_get_node_refs_for_txn( + ref->node, &target_proc, + &return_error); + } else { + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); - if (target_node == NULL) { - binder_user_error("%d:%d got transaction to invalid handle\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_invalid_target_handle; - } } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; - if (target_node == NULL) { + if (target_node) + target_node = binder_get_node_refs_for_txn( + target_node, &target_proc, + &return_error); + else return_error = BR_DEAD_REPLY; - mutex_unlock(&context->context_mgr_node_lock); - return_error_line = __LINE__; - goto err_no_context_mgr_node; - } - binder_inc_node(target_node, 1, 0, NULL); mutex_unlock(&context->context_mgr_node_lock); } - e->to_node = target_node->debug_id; - binder_node_lock(target_node); - target_proc = target_node->proc; - if (target_proc == NULL) { - binder_node_unlock(target_node); - return_error = BR_DEAD_REPLY; + if (!target_node) { + /* + * return_error is set above + */ + return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } - binder_inner_proc_lock(target_proc); - target_proc->tmp_ref++; - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); + e->to_node = target_node->debug_id; if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -3071,6 +3105,8 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); + if (target_node) + binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry @@ -3090,6 +3126,8 @@ static void binder_transaction(struct binder_proc *proc, err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); + if (target_node) + binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); @@ -3104,13 +3142,14 @@ static void binder_transaction(struct binder_proc *proc, err_empty_call_stack: err_dead_binder: err_invalid_target_handle: -err_no_context_mgr_node: if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); - if (target_node) + if (target_node) { binder_dec_node(target_node, 1, 0); + binder_dec_node_tmpref(target_node); + } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", From ad670233c9e1d5feb365d870e30083ef1b889177 Mon Sep 17 00:00:00 2001 From: Peng Xu Date: Tue, 3 Oct 2017 23:21:51 +0300 Subject: [PATCH 042/217] nl80211: Define policy for packet pattern attributes Define a policy for packet pattern attributes in order to fix a potential read over the end of the buffer during nla_get_u32() of the NL80211_PKTPAT_OFFSET attribute. Note that the data there can always be read due to SKB allocation (with alignment and struct skb_shared_info at the end), but the data might be uninitialized. This could be used to leak some data from uninitialized vmalloc() memory, but most drivers don't allow an offset (so you'd just get -EINVAL if the data is non-zero) or just allow it with a fixed value - 100 or 128 bytes, so anything above that would get -EINVAL. With brcmfmac the limit is 1500 so (at least) one byte could be obtained. Cc: stable@kernel.org Signed-off-by: Peng Xu Signed-off-by: Jouni Malinen [rewrite description based on SKB allocation knowledge] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 690874293cfc..d396cb61a280 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -549,6 +549,14 @@ nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = { [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED }, }; +/* policy for packet pattern attributes */ +static const struct nla_policy +nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { + [NL80211_PKTPAT_MASK] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -10532,7 +10540,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) u8 *mask_pat; nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - NULL, info->extack); + nl80211_packet_pattern_policy, + info->extack); err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -10781,7 +10790,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL, NULL); + nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, NULL); if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; From e8fa33a6f6c7688591542db955794b69b8cecc55 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Oct 2017 13:49:12 -0400 Subject: [PATCH 043/217] NFSv4/pnfs: Fix an infinite layoutget loop Since we can now use a lock stateid or a delegation stateid, that differs from the context stateid, we need to change the test in nfs4_layoutget_handle_exception() to take this into account. This fixes an infinite layoutget loop in the NFS client whereby it keeps retrying the initial layoutget using the same broken stateid. Fixes: 70d2f7b1ea19b ("pNFS: Use the standard I/O stateid when...") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6c61e2b99635..f90090e8c959 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8399,8 +8399,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, lo = NFS_I(inode)->layout; /* If the open stateid was bad, then recover it. */ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || - nfs4_stateid_match_other(&lgp->args.stateid, - &lgp->args.ctx->state->stateid)) { + !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; exception->stateid = &lgp->args.stateid; From 69a330007091ea8a801dd9fcd897ec52f9529586 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Mon, 2 Oct 2017 11:28:35 +0200 Subject: [PATCH 044/217] RAS/CEC: Use the right length for "cec_disable" parse_cec_param() compares a string with "cec_disable" using only 7 characters of the 11-character-long string. The proper solution for this would be: #define CEC_DISABLE "cec_disable" strncmp(str, CEC_DISABLE, strlen(CEC_DISABLE)) but when comparing a string against a string constant strncmp() has no advantage over strcmp() because the comparison is guaranteed to be bound by the string constant. So just replace str strncmp() with strcmp(). [ tglx: Made it use strcmp and updated the changelog ] Fixes: 011d82611172 ("RAS: Add a Corrected Errors Collector") Signed-off-by: Nicolas Iooss Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170903075440.30250-1-nicolas.iooss_linux@m4x.org --- drivers/ras/cec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index d0e5d6ee882c..e2c1988cd7c0 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -523,7 +523,7 @@ int __init parse_cec_param(char *str) if (*str == '=') str++; - if (!strncmp(str, "cec_disable", 7)) + if (!strcmp(str, "cec_disable")) ce_arr.disabled = 1; else return 0; From 262e681183ddcdb24d64a2f993e41a226adcec29 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 2 Oct 2017 11:28:36 +0200 Subject: [PATCH 045/217] x86/mce: Hide mca_cfg Now that lguest is gone, put it in the internal header which should be used only by MCA/RAS code. Add missing header guards while at it. No functional change. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171002092836.22971-3-bp@alien8.de --- arch/x86/include/asm/mce.h | 1 - arch/x86/kernel/cpu/mcheck/mce-internal.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 181264989db5..8edac1de2e35 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -187,7 +187,6 @@ struct mca_msr_regs { extern struct mce_vendor_flags mce_flags; -extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; enum mce_notifier_prios { diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 098530a93bb7..debb974fd17d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,6 @@ +#ifndef __X86_MCE_INTERNAL_H__ +#define __X86_MCE_INTERNAL_H__ + #include #include @@ -108,3 +111,7 @@ static inline void mce_work_trigger(void) { } static inline void mce_register_injector_chain(struct notifier_block *nb) { } static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif + +extern struct mca_config mca_cfg; + +#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 40e28ed77fbf..486f640b02ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -28,6 +28,8 @@ #include #include +#include "mce-internal.h" + #define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 From c1fbc0cf81f1c464f5fda322c1104d4bb1da6711 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 5 Oct 2017 14:42:34 +0530 Subject: [PATCH 046/217] perf callchain: Compare dsos (as well) for CCKEY_FUNCTION Two functions from different binaries can have same start address. Thus, comparing only start address in match_chain() leads to inconsistent callchains. Fix this by adding a check for dsos as well. Ex, https://www.spinics.net/lists/linux-perf-users/msg04067.html Reported-by: Alexander Pozdneev Signed-off-by: Ravi Bangoria Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Krister Johansen Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yao Jin Cc: zhangmengting@huawei.com Link: http://lkml.kernel.org/r/20171005091234.5874-1-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index be09d77cade0..a971caf3759d 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -685,6 +685,8 @@ static enum match_result match_chain(struct callchain_cursor_node *node, { struct symbol *sym = node->sym; u64 left, right; + struct dso *left_dso = NULL; + struct dso *right_dso = NULL; if (callchain_param.key == CCKEY_SRCLINE) { enum match_result match = match_chain_srcline(node, cnode); @@ -696,12 +698,14 @@ static enum match_result match_chain(struct callchain_cursor_node *node, if (cnode->ms.sym && sym && callchain_param.key == CCKEY_FUNCTION) { left = cnode->ms.sym->start; right = sym->start; + left_dso = cnode->ms.map->dso; + right_dso = node->map->dso; } else { left = cnode->ip; right = node->ip; } - if (left == right) { + if (left == right && left_dso == right_dso) { if (node->branch) { cnode->branch_count++; From 3346a6a4e5ba8c040360f753b26938cec31a4bdc Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 2 Oct 2017 16:16:13 -0600 Subject: [PATCH 047/217] selftests: x86: sysret_ss_attrs doesn't build on a PIE build sysret_ss_attrs fails to compile leading x86 test run to fail on systems configured to build using PIE by default. Add -no-pie fix it. Relocation might still fail if relocated above 4G. For now this change fixes the build and runs x86 tests. tools/testing/selftests/x86$ make gcc -m64 -o .../tools/testing/selftests/x86/single_step_syscall_64 -O2 -g -std=gnu99 -pthread -Wall single_step_syscall.c -lrt -ldl gcc -m64 -o .../tools/testing/selftests/x86/sysret_ss_attrs_64 -O2 -g -std=gnu99 -pthread -Wall sysret_ss_attrs.c thunks.S -lrt -ldl /usr/bin/ld: /tmp/ccS6pvIh.o: relocation R_X86_64_32S against `.text' can not be used when making a shared object; recompile with -fPIC /usr/bin/ld: final link failed: Nonrepresentable section on output collect2: error: ld returned 1 exit status Makefile:49: recipe for target '.../tools/testing/selftests/x86/sysret_ss_attrs_64' failed make: *** [.../tools/testing/selftests/x86/sysret_ss_attrs_64] Error 1 Suggested-by: Andy Lutomirski Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 97f187e2663f..0a74a20ca32b 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -20,7 +20,7 @@ BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) -CFLAGS := -O2 -g -std=gnu99 -pthread -Wall +CFLAGS := -O2 -g -std=gnu99 -pthread -Wall -no-pie UNAME_M := $(shell uname -m) CAN_BUILD_I386 := $(shell ./check_cc.sh $(CC) trivial_32bit_program.c -m32) From ea344f6a507f8ce92bf1e10f044c6cfc67e4d22b Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 2 Oct 2017 17:02:22 -0600 Subject: [PATCH 048/217] selftests: mqueue: fix regression in silencing output from RUN_TESTS Fix fix regression in silencing output from RUN_TESTS introduced by commit <8230b905a6780c6> selftests: mqueue: Use full path to run tests from Makefile Signed-off-by: Shuah Khan --- tools/testing/selftests/mqueue/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile index 0f5e347b068d..152823b6cb21 100644 --- a/tools/testing/selftests/mqueue/Makefile +++ b/tools/testing/selftests/mqueue/Makefile @@ -5,8 +5,8 @@ TEST_GEN_PROGS := mq_open_tests mq_perf_tests include ../lib.mk override define RUN_TESTS - $(OUTPUT)/mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" - $(OUTPUT)//mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" + @$(OUTPUT)/mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" + @$(OUTPUT)/mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" endef override define EMIT_TESTS From ec572b9e81b1df79147c2e6f69458e65cf248598 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Fri, 29 Sep 2017 15:01:10 +0800 Subject: [PATCH 049/217] nfsd4: define nfsd4_secinfo_no_name_release() Commit 34b1744c91cc ("nfsd4: define ->op_release for compound ops") defined a couple ->op_release functions and run them if necessary. But there's a problem with that is that it reused nfsd4_secinfo_release() as the op_release of OP_SECINFO_NO_NAME, and caused a leak on struct nfsd4_secinfo_no_name in nfsd4_encode_secinfo_no_name(), because there's no .si_exp field in struct nfsd4_secinfo_no_name. I found this because I was unable to umount an ext4 partition after exporting it via NFS & run fsstress on the nfs mount. A simplified reproducer would be: # mount a local-fs device at /mnt/test, and export it via NFS with # fsid=0 export option (this is required) mount /dev/sda5 /mnt/test echo "/mnt/test *(rw,no_root_squash,fsid=0)" >> /etc/exports service nfs restart # locally mount the nfs export with all default, note that I have # nfsv4.1 configured as the default nfs version, because of the # fsid export option, v4 mount would fail and fall back to v3 mount localhost:/mnt/test /mnt/nfs # try to umount the underlying device, but got EBUSY umount /mnt/nfs service nfs stop umount /mnt/test <=== EBUSY here Fixed it by defining a separate nfsd4_secinfo_no_name_release() function as the op_release method of OP_SECINFO_NO_NAME that releases the correct nfsd4_secinfo_no_name structure. Fixes: 34b1744c91cc ("nfsd4: define ->op_release for compound ops") Signed-off-by: Eryu Guan Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3c69db7d4905..8487486ec496 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -927,6 +927,13 @@ nfsd4_secinfo_release(union nfsd4_op_u *u) exp_put(u->secinfo.si_exp); } +static void +nfsd4_secinfo_no_name_release(union nfsd4_op_u *u) +{ + if (u->secinfo_no_name.sin_exp) + exp_put(u->secinfo_no_name.sin_exp); +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -2375,7 +2382,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_SECINFO_NO_NAME] = { .op_func = nfsd4_secinfo_no_name, - .op_release = nfsd4_secinfo_release, + .op_release = nfsd4_secinfo_no_name_release, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", .op_rsize_bop = nfsd4_secinfo_rsize, From 1561b3266ebe029c487a95f92d1a58c03ded84a1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 Oct 2017 15:53:47 +0300 Subject: [PATCH 050/217] selftests/net: rxtimestamp: Fix an off by one The > should be >= so that we don't write one element beyond the end of the array. Fixes: 16e781224198 ("selftests/net: Add a test to validate behavior of rx timestamps") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- tools/testing/selftests/networking/timestamping/rxtimestamp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/networking/timestamping/rxtimestamp.c b/tools/testing/selftests/networking/timestamping/rxtimestamp.c index 00f286661dcd..dd4162fc0419 100644 --- a/tools/testing/selftests/networking/timestamping/rxtimestamp.c +++ b/tools/testing/selftests/networking/timestamping/rxtimestamp.c @@ -341,7 +341,7 @@ int main(int argc, char **argv) return 0; case 'n': t = atoi(optarg); - if (t > ARRAY_SIZE(test_cases)) + if (t >= ARRAY_SIZE(test_cases)) error(1, 0, "Invalid test case: %d", t); all_tests = false; test_cases[t].enabled = true; From 265e60a170d0a0ecfc2d20490134ed2c48dd45ab Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 17 Aug 2017 20:42:26 +1000 Subject: [PATCH 051/217] powerpc/64s: Use emergency stack for kernel TM Bad Thing program checks When using transactional memory (TM), the CPU can be in one of six states as far as TM is concerned, encoded in the Machine State Register (MSR). Certain state transitions are illegal and if attempted trigger a "TM Bad Thing" type program check exception. If we ever hit one of these exceptions it's treated as a bug, ie. we oops, and kill the process and/or panic, depending on configuration. One case where we can trigger a TM Bad Thing, is when returning to userspace after a system call or interrupt, using RFID. When this happens the CPU first restores the user register state, in particular r1 (the stack pointer) and then attempts to update the MSR. However the MSR update is not allowed and so we take the program check with the user register state, but the kernel MSR. This tricks the exception entry code into thinking we have a bad kernel stack pointer, because the MSR says we're coming from the kernel, but r1 is pointing to userspace. To avoid this we instead always switch to the emergency stack if we take a TM Bad Thing from the kernel. That way none of the user register values are used, other than for printing in the oops message. This is the fix for CVE-2017-1000255. Fixes: 5d176f751ee3 ("powerpc: tm: Enable transactional memory (TM) lazily for userspace") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Cyril Bur [mpe: Rewrite change log & comments, tweak asm slightly] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 48da0f5d2f7f..b82586c53560 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -734,7 +734,29 @@ EXC_REAL(program_check, 0x700, 0x100) EXC_VIRT(program_check, 0x4700, 0x100, 0x700) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) - EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) + /* + * It's possible to receive a TM Bad Thing type program check with + * userspace register values (in particular r1), but with SRR1 reporting + * that we came from the kernel. Normally that would confuse the bad + * stack logic, and we would report a bad kernel stack pointer. Instead + * we switch to the emergency stack if we're taking a TM Bad Thing from + * the kernel. + */ + li r10,MSR_PR /* Build a mask of MSR_PR .. */ + oris r10,r10,0x200000@h /* .. and SRR1_PROGTM */ + and r10,r10,r12 /* Mask SRR1 with that. */ + srdi r10,r10,8 /* Shift it so we can compare */ + cmpldi r10,(0x200000 >> 8) /* .. with an immediate. */ + bne 1f /* If != go to normal path. */ + + /* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack */ + andi. r10,r12,MSR_PR; /* Set CR0 correctly for label */ + /* 3 in EXCEPTION_PROLOG_COMMON */ + mr r10,r1 /* Save r1 */ + ld r1,PACAEMERGSP(r13) /* Use emergency stack */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + b 3f /* Jump into the macro !! */ +1: EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD From 044215d145a7a8a60ffa8fdc859d110a795fa6ea Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Tue, 22 Aug 2017 17:20:09 -0400 Subject: [PATCH 052/217] powerpc/tm: Fix illegal TM state in signal handler Currently it's possible that on returning from the signal handler through the restore_tm_sigcontexts() code path (e.g. from a signal caught due to a `trap` instruction executed in the middle of an HTM block, or a deliberately constructed sigframe) an illegal TM state (like TS=10 TM=0, i.e. "T0") is set in SRR1 and when `rfid` sets implicitly the MSR register from SRR1 register on return to userspace it causes a TM Bad Thing exception. That illegal state can be set (a) by a malicious user that disables the TM bit by tweaking the bits in uc_mcontext before returning from the signal handler or (b) by a sufficient number of context switches occurring such that the load_tm counter overflows and TM is disabled whilst in the signal handler. This commit fixes the illegal TM state by ensuring that TM bit is always enabled before we return from restore_tm_sigcontexts(). A small comment correction is made as well. Fixes: 5d176f751ee3 ("powerpc: tm: Enable transactional memory (TM) lazily for userspace") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Gustavo Romero Signed-off-by: Breno Leitao Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/signal_64.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index c83c115858c1..b2c002993d78 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -452,9 +452,20 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, if (MSR_TM_RESV(msr)) return -EINVAL; - /* pull in MSR TM from user context */ + /* pull in MSR TS bits from user context */ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + /* + * Ensure that TM is enabled in regs->msr before we leave the signal + * handler. It could be the case that (a) user disabled the TM bit + * through the manipulation of the MSR bits in uc_mcontext or (b) the + * TM bit was disabled because a sufficient number of context switches + * happened whilst in the signal handler and load_tm overflowed, + * disabling the TM bit. In either case we can end up with an illegal + * TM state leading to a TM Bad Thing when we return to userspace. + */ + regs->msr |= MSR_TM; + /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); From e9516c0813aeb89ebd19ec0ed39fbfcd78b6ef3a Mon Sep 17 00:00:00 2001 From: Mark Santaniello Date: Fri, 6 Oct 2017 01:07:22 -0700 Subject: [PATCH 053/217] perf script: Add missing separator for "-F ip,brstack" (and brstackoff) Prior to commit 55b9b50811ca ("perf script: Support -F brstack,dso and brstacksym,dso"), we were printing a space before the brstack data. It seems that this space was important. Without it, parsing is difficult. Very sorry for the mistake. Notice here how the "ip" and "brstack" run together: $ perf script -F ip,brstack | head -n 1 22e18c40x22e19e2/0x22e190b/P/-/-/0 0x22e19a1/0x22e19d0/P/-/-/0 0x22e195d/0x22e1990/P/-/-/0 0x22e18e9/0x22e1943/P/-/-/0 0x22e1a69/0x22e18c0/P/-/-/0 0x22e19f7/0x22e1a20/P/-/-/0 0x22e1910/0x22e19ee/P/-/-/0 0x22e19e2/0x22e190b/P/-/-/0 0x22e19a1/0x22e19d0/P/-/-/0 0x22e195d/0x22e1990/P/-/-/0 0x22e18e9/0x22e1943/P/-/-/0 0x22e1a69/0x22e18c0/P/-/-/0 0x22e19f7/0x22e1a20/P/-/-/0 0x22e1910/0x22e19ee/P/-/-/0 0x22e19e2/0x22e190b/P/-/-/0 0x22e19a1/0x22e19d0/P/-/-/0 After this diff, sanity is restored: $ perf script -F ip,brstack | head -n 1 22e18c4 0x22e19e2/0x22e190b/P/-/-/0 0x22e19a1/0x22e19d0/P/-/-/0 0x22e195d/0x22e1990/P/-/-/0 0x22e18e9/0x22e1943/P/-/-/0 0x22e1a69/0x22e18c0/P/-/-/0 0x22e19f7/0x22e1a20/P/-/-/0 0x22e1910/0x22e19ee/P/-/-/0 0x22e19e2/0x22e190b/P/-/-/0 0x22e19a1/0x22e19d0/P/-/-/0 0x22e195d/0x22e1990/P/-/-/0 0x22e18e9/0x22e1943/P/-/-/0 0x22e1a69/0x22e18c0/P/-/-/0 0x22e19f7/0x22e1a20/P/-/-/0 0x22e1910/0x22e19ee/P/-/-/0 0x22e19e2/0x22e190b/P/-/-/0 0x22e19a1/0x22e19d0/P/-/-/0 Signed-off-by: Mark Santaniello Cc: Alexander Shishkin Cc: Peter Zijlstra Cc: 4.13+ Fixes: 55b9b50811ca ("perf script: Support -F brstack,dso and brstacksym,dso") Link: http://lkml.kernel.org/r/20171006080722.3442046-1-marksan@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3d4c3b5e1868..0c977b6e0f8b 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -586,7 +586,7 @@ static void print_sample_brstack(struct perf_sample *sample, thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt); } - printf("0x%"PRIx64, from); + printf(" 0x%"PRIx64, from); if (PRINT_FIELD(DSO)) { printf("("); map__fprintf_dsoname(alf.map, stdout); @@ -681,7 +681,7 @@ static void print_sample_brstackoff(struct perf_sample *sample, if (alt.map && !alt.map->dso->adjust_symbols) to = map__map_ip(alt.map, to); - printf("0x%"PRIx64, from); + printf(" 0x%"PRIx64, from); if (PRINT_FIELD(DSO)) { printf("("); map__fprintf_dsoname(alf.map, stdout); From 5f9bfe0ef622a7bb9707c22ceb4b6451e1e2cb7b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 4 Oct 2017 17:18:27 +0200 Subject: [PATCH 054/217] netfilter: nf_tables: do not dump chain counters if not enabled Chain counters are only enabled on demand since 9f08ea848117, skip them when dumping them via netlink. Fixes: 9f08ea848117 ("netfilter: nf_tables: keep chain counters away from hot path") Reported-by: Johny Mattsson Tested-by: Johny Mattsson Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 34adedcb239e..64e1ee091225 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1048,7 +1048,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; - if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) + if (basechain->stats && nft_dump_stats(skb, basechain->stats)) goto nla_put_failure; } From e466af75c074e76107ae1cd5a2823e9c61894ffb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 02:50:07 -0700 Subject: [PATCH 055/217] netfilter: x_tables: avoid stack-out-of-bounds read in xt_copy_counters_from_user syzkaller reports an out of bound read in strlcpy(), triggered by xt_copy_counters_from_user() Fix this by using memcpy(), then forcing a zero byte at the last position of the destination, as Florian did for the non COMPAT code. Fixes: d7591f0c41ce ("netfilter: x_tables: introduce and use xt_copy_counters_from_user") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c83a3b5e1c6c..d8571f414208 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -892,7 +892,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); - strlcpy(info->name, compat_tmp.name, sizeof(info->name)); + memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; user += sizeof(compat_tmp); } else @@ -905,9 +905,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(info, user, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - info->name[sizeof(info->name) - 1] = '\0'; user += sizeof(*info); } + info->name[sizeof(info->name) - 1] = '\0'; size = sizeof(struct xt_counters); size *= info->num_counters; From 6151b8b37b119e8e3a8401b080d532520c95faf4 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 6 Oct 2017 17:05:49 +0200 Subject: [PATCH 056/217] ppp: fix race in ppp device destruction ppp_release() tries to ensure that netdevices are unregistered before decrementing the unit refcount and running ppp_destroy_interface(). This is all fine as long as the the device is unregistered by ppp_release(): the unregister_netdevice() call, followed by rtnl_unlock(), guarantee that the unregistration process completes before rtnl_unlock() returns. However, the device may be unregistered by other means (like ppp_nl_dellink()). If this happens right before ppp_release() calling rtnl_lock(), then ppp_release() has to wait for the concurrent unregistration code to release the lock. But rtnl_unlock() releases the lock before completing the device unregistration process. This allows ppp_release() to proceed and eventually call ppp_destroy_interface() before the unregistration process completes. Calling free_netdev() on this partially unregistered device will BUG(): ------------[ cut here ]------------ kernel BUG at net/core/dev.c:8141! invalid opcode: 0000 [#1] SMP CPU: 1 PID: 1557 Comm: pppd Not tainted 4.14.0-rc2+ #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 04/01/2014 Call Trace: ppp_destroy_interface+0xd8/0xe0 [ppp_generic] ppp_disconnect_channel+0xda/0x110 [ppp_generic] ppp_unregister_channel+0x5e/0x110 [ppp_generic] pppox_unbind_sock+0x23/0x30 [pppox] pppoe_connect+0x130/0x440 [pppoe] SYSC_connect+0x98/0x110 ? do_fcntl+0x2c0/0x5d0 SyS_connect+0xe/0x10 entry_SYSCALL_64_fastpath+0x1a/0xa5 RIP: free_netdev+0x107/0x110 RSP: ffffc28a40573d88 ---[ end trace ed294ff0cc40eeff ]--- We could set the ->needs_free_netdev flag on PPP devices and move the ppp_destroy_interface() logic in the ->priv_destructor() callback. But that'd be quite intrusive as we'd first need to unlink from the other channels and units that depend on the device (the ones that used the PPPIOCCONNECT and PPPIOCATTACH ioctls). Instead, we can just let the netdevice hold a reference on its ppp_file. This reference is dropped in ->priv_destructor(), at the very end of the unregistration process, so that neither ppp_release() nor ppp_disconnect_channel() can call ppp_destroy_interface() in the interim. Reported-by: Beniamino Galvani Fixes: 8cb775bc0a34 ("ppp: fix device unregistration upon netns deletion") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index c3f77e3b7819..e365866600ba 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1339,7 +1339,17 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64) static int ppp_dev_init(struct net_device *dev) { + struct ppp *ppp; + netdev_lockdep_set_classes(dev); + + ppp = netdev_priv(dev); + /* Let the netdevice take a reference on the ppp file. This ensures + * that ppp_destroy_interface() won't run before the device gets + * unregistered. + */ + atomic_inc(&ppp->file.refcnt); + return 0; } @@ -1362,6 +1372,15 @@ static void ppp_dev_uninit(struct net_device *dev) wake_up_interruptible(&ppp->file.rwait); } +static void ppp_dev_priv_destructor(struct net_device *dev) +{ + struct ppp *ppp; + + ppp = netdev_priv(dev); + if (atomic_dec_and_test(&ppp->file.refcnt)) + ppp_destroy_interface(ppp); +} + static const struct net_device_ops ppp_netdev_ops = { .ndo_init = ppp_dev_init, .ndo_uninit = ppp_dev_uninit, @@ -1387,6 +1406,7 @@ static void ppp_setup(struct net_device *dev) dev->tx_queue_len = 3; dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + dev->priv_destructor = ppp_dev_priv_destructor; netif_keep_dst(dev); } From 532f419cde077ffe9616c97902af177fbb868b17 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 11:35:39 +0200 Subject: [PATCH 057/217] crypto: stm32 - Try to fix hash padding gcc warns that the length for the extra unaligned data in the hash function may be used unaligned. In theory this could happen if we pass a zero-length sg_list, or if sg_is_last() was never true: In file included from drivers/crypto/stm32/stm32-hash.c:23: drivers/crypto/stm32/stm32-hash.c: In function 'stm32_hash_one_request': include/uapi/linux/kernel.h:12:49: error: 'ncp' may be used uninitialized in this function [-Werror=maybe-uninitialized] #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) Neither of these can happen in practice, so the warning is harmless. However while trying to suppress the warning, I noticed multiple problems with that code: - On big-endian kernels, we byte-swap the data like we do for register accesses, however this is a data stream and almost certainly needs to use a single writesl() instead of series of writel() to give the correct hash. - If the length is not a multiple of four bytes, we skip the last word entirely, since we write the truncated length using stm32_hash_set_nblw(). - If we change the code to round the length up rather than down, the last bytes contain stale data, so it needs some form of padding. This tries to address all four problems, by correctly initializing the length to zero, using endian-safe copy functions, adding zero-padding and passing the padded length. I have done no testing on this patch, so please review carefully and if possible test with an unaligned length and big-endian kernel builds. Fixes: 8a1012d3f2ab ("crypto: stm32 - Support for STM32 HASH module") Signed-off-by: Arnd Bergmann Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-hash.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index b585ce54a802..4835dd4a9e50 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -553,9 +553,9 @@ static int stm32_hash_dma_send(struct stm32_hash_dev *hdev) { struct stm32_hash_request_ctx *rctx = ahash_request_ctx(hdev->req); struct scatterlist sg[1], *tsg; - int err = 0, len = 0, reg, ncp; + int err = 0, len = 0, reg, ncp = 0; unsigned int i; - const u32 *buffer = (const u32 *)rctx->buffer; + u32 *buffer = (void *)rctx->buffer; rctx->sg = hdev->req->src; rctx->total = hdev->req->nbytes; @@ -620,10 +620,13 @@ static int stm32_hash_dma_send(struct stm32_hash_dev *hdev) reg |= HASH_CR_DMAA; stm32_hash_write(hdev, HASH_CR, reg); - for (i = 0; i < DIV_ROUND_UP(ncp, sizeof(u32)); i++) - stm32_hash_write(hdev, HASH_DIN, buffer[i]); - - stm32_hash_set_nblw(hdev, ncp); + if (ncp) { + memset(buffer + ncp, 0, + DIV_ROUND_UP(ncp, sizeof(u32)) - ncp); + writesl(hdev->io_base + HASH_DIN, buffer, + DIV_ROUND_UP(ncp, sizeof(u32))); + } + stm32_hash_set_nblw(hdev, DIV_ROUND_UP(ncp, sizeof(u32))); reg = stm32_hash_read(hdev, HASH_STR); reg |= HASH_STR_DCAL; stm32_hash_write(hdev, HASH_STR, reg); From 5125e4e867ab888f2d4b443a1ce463adefb370db Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Tue, 26 Sep 2017 08:17:44 +0200 Subject: [PATCH 058/217] crypto: xts - Fix an error handling path in 'create()' All error handling paths 'goto err_drop_spawn' except this one. In order to avoid some resources leak, we should do it as well here. Fixes: f1c131b45410 ("crypto: xts - Convert to skcipher") Signed-off-by: Christophe JAILLET Signed-off-by: Herbert Xu --- crypto/xts.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crypto/xts.c b/crypto/xts.c index d86c11a8c882..e31828ed0046 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -554,8 +554,10 @@ static int create(struct crypto_template *tmpl, struct rtattr **tb) ctx->name[len - 1] = 0; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, - "xts(%s)", ctx->name) >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; + "xts(%s)", ctx->name) >= CRYPTO_MAX_ALG_NAME) { + err = -ENAMETOOLONG; + goto err_drop_spawn; + } } else goto err_drop_spawn; From 9039f3ef446e9ffa200200c934f049add9e58426 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 3 Oct 2017 10:25:22 +0800 Subject: [PATCH 059/217] crypto: shash - Fix a sleep-in-atomic bug in shash_setkey_unaligned The SCTP program may sleep under a spinlock, and the function call path is: sctp_generate_t3_rtx_event (acquire the spinlock) sctp_do_sm sctp_side_effects sctp_cmd_interpreter sctp_make_init_ack sctp_pack_cookie crypto_shash_setkey shash_setkey_unaligned kmalloc(GFP_KERNEL) For the same reason, the orinoco driver may sleep in interrupt handler, and the function call path is: orinoco_rx_isr_tasklet orinoco_rx orinoco_mic crypto_shash_setkey shash_setkey_unaligned kmalloc(GFP_KERNEL) To fix it, GFP_KERNEL is replaced with GFP_ATOMIC. This bug is found by my static analysis tool and my code review. Signed-off-by: Jia-Ju Bai Signed-off-by: Herbert Xu --- crypto/shash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/shash.c b/crypto/shash.c index 5e31c8d776df..8fcecc66741d 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -41,7 +41,7 @@ static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key, int err; absize = keylen + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); - buffer = kmalloc(absize, GFP_KERNEL); + buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; From 0cabf2af6f5ac3c88cb106c4e06087a5a39b8e1e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 7 Oct 2017 11:29:48 +0800 Subject: [PATCH 060/217] crypto: skcipher - Fix crash on zero-length input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skcipher walk interface doesn't handle zero-length input properly as the old blkcipher walk interface did. This is due to the fact that the length check is done too late. This patch moves the length check forward so that it does the right thing. Fixes: b286d8b1a690 ("crypto: skcipher - Add skcipher walk...") Cc: Reported-by: Stephan Müller Signed-off-by: Herbert Xu --- crypto/skcipher.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 4faa0fd53b0c..d5692e35fab1 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -426,14 +426,9 @@ static int skcipher_copy_iv(struct skcipher_walk *walk) static int skcipher_walk_first(struct skcipher_walk *walk) { - walk->nbytes = 0; - if (WARN_ON_ONCE(in_irq())) return -EDEADLK; - if (unlikely(!walk->total)) - return 0; - walk->buffer = NULL; if (unlikely(((unsigned long)walk->iv & walk->alignmask))) { int err = skcipher_copy_iv(walk); @@ -452,10 +447,15 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk, { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + walk->total = req->cryptlen; + walk->nbytes = 0; + + if (unlikely(!walk->total)) + return 0; + scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); - walk->total = req->cryptlen; walk->iv = req->iv; walk->oiv = req->iv; @@ -509,6 +509,11 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk, struct crypto_aead *tfm = crypto_aead_reqtfm(req); int err; + walk->nbytes = 0; + + if (unlikely(!walk->total)) + return 0; + walk->flags &= ~SKCIPHER_WALK_PHYS; scatterwalk_start(&walk->in, req->src); From 80ac93c274411a55ae731f259f75e4ca5e499e8b Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 3 Oct 2017 11:17:05 -0500 Subject: [PATCH 061/217] gpio: omap: Fix lost edge interrupts Now acking of edge irqs happens the following way: - omap_gpio_irq_handler - "isr" = read irq status - omap_clear_gpio_irqbank(bank, isr_saved & ~level_mask); ^ clear edge status, so irq can be accepted - loop while "isr" generic_handle_irq() - handle_edge_irq() - desc->irq_data.chip->irq_ack(&desc->irq_data); - omap_gpio_ack_irq() it might be that at this moment edge IRQ was triggered again and it will be cleared and IRQ will be lost. Use handle_simple_irq and clear edge interrupts early without disabling them in omap_gpio_irq_handler to avoid loosing interrupts. [1] https://marc.info/?l=linux-omap&m=149004465313534&w=2 Signed-off-by: Grygorii Strashko Signed-off-by: Ladislav Michl Signed-off-by: Linus Walleij --- drivers/gpio/gpio-omap.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index 22d7d4838265..3233b72b6828 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -518,7 +518,13 @@ static int omap_gpio_irq_type(struct irq_data *d, unsigned type) if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) irq_set_handler_locked(d, handle_level_irq); else if (type & (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING)) - irq_set_handler_locked(d, handle_edge_irq); + /* + * Edge IRQs are already cleared/acked in irq_handler and + * not need to be masked, as result handle_edge_irq() + * logic is excessed here and may cause lose of interrupts. + * So just use handle_simple_irq. + */ + irq_set_handler_locked(d, handle_simple_irq); return 0; @@ -678,7 +684,7 @@ static void omap_gpio_free(struct gpio_chip *chip, unsigned offset) static irqreturn_t omap_gpio_irq_handler(int irq, void *gpiobank) { void __iomem *isr_reg = NULL; - u32 isr; + u32 enabled, isr, level_mask; unsigned int bit; struct gpio_bank *bank = gpiobank; unsigned long wa_lock_flags; @@ -691,23 +697,21 @@ static irqreturn_t omap_gpio_irq_handler(int irq, void *gpiobank) pm_runtime_get_sync(bank->chip.parent); while (1) { - u32 isr_saved, level_mask = 0; - u32 enabled; - raw_spin_lock_irqsave(&bank->lock, lock_flags); enabled = omap_get_gpio_irqbank_mask(bank); - isr_saved = isr = readl_relaxed(isr_reg) & enabled; + isr = readl_relaxed(isr_reg) & enabled; if (bank->level_mask) level_mask = bank->level_mask & enabled; + else + level_mask = 0; /* clear edge sensitive interrupts before handler(s) are called so that we don't miss any interrupt occurred while executing them */ - omap_disable_gpio_irqbank(bank, isr_saved & ~level_mask); - omap_clear_gpio_irqbank(bank, isr_saved & ~level_mask); - omap_enable_gpio_irqbank(bank, isr_saved & ~level_mask); + if (isr & ~level_mask) + omap_clear_gpio_irqbank(bank, isr & ~level_mask); raw_spin_unlock_irqrestore(&bank->lock, lock_flags); From a2d3f3e33853ef52e5f66b41c3e8ee5710aa3305 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 5 Oct 2017 19:03:05 +0200 Subject: [PATCH 062/217] ipv6: fix net.ipv6.conf.all.accept_dad behaviour for real Commit 35e015e1f577 ("ipv6: fix net.ipv6.conf.all interface DAD handlers") was intended to affect accept_dad flag handling in such a way that DAD operation and mode on a given interface would be selected according to the maximum value of conf/{all,interface}/accept_dad. However, addrconf_dad_begin() checks for particular cases in which we need to skip DAD, and this check was modified in the wrong way. Namely, it was modified so that, if the accept_dad flag is 0 for the given interface *or* for all interfaces, DAD would be skipped. We have instead to skip DAD if accept_dad is 0 for the given interface *and* for all interfaces. Fixes: 35e015e1f577 ("ipv6: fix net.ipv6.conf.all interface DAD handlers") Acked-by: Stefano Brivio Signed-off-by: Matteo Croce Reported-by: Erik Kline Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 96861c702c06..4a96ebbf8eda 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3820,8 +3820,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || - idev->cnf.accept_dad < 1 || + (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bump_id = ifp->flags & IFA_F_TENTATIVE; From 00a534e5ea5c21b95f58cbb2f7918cc9fa82dd47 Mon Sep 17 00:00:00 2001 From: Axel Beckert Date: Thu, 5 Oct 2017 22:00:33 +0200 Subject: [PATCH 063/217] doc: Fix typo "8023.ad" in bonding documentation Should be "802.3ad" like everywhere else in the document. Signed-off-by: Axel Beckert Signed-off-by: David S. Miller --- Documentation/networking/bonding.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 57f52cdce32e..9ba04c0bab8d 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -2387,7 +2387,7 @@ broadcast: Like active-backup, there is not much advantage to this and packet type ID), so in a "gatewayed" configuration, all outgoing traffic will generally use the same device. Incoming traffic may also end up on a single device, but that is - dependent upon the balancing policy of the peer's 8023.ad + dependent upon the balancing policy of the peer's 802.3ad implementation. In a "local" configuration, traffic will be distributed across the devices in the bond. From 8fe2d6ccd52b086268f2f36e5e2fc0fe3aeffa80 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 Oct 2017 16:20:56 -0700 Subject: [PATCH 064/217] bpf: fix liveness marking while processing Rx = Ry instruction the verifier does regs[insn->dst_reg] = regs[insn->src_reg] which often clears write mark (when Ry doesn't have it) that was just set by check_reg_arg(Rx) prior to the assignment. That causes mark_reg_read() to keep marking Rx in this block as REG_LIVE_READ (since the logic incorrectly misses that it's screened by the write) and in many of its parents (until lucky write into the same Rx or beginning of the program). That causes is_state_visited() logic to miss many pruning opportunities. Furthermore mark_reg_read() logic propagates the read mark for BPF_REG_FP as well (though it's readonly) which causes harmless but unnecssary work during is_state_visited(). Note that do_propagate_liveness() skips FP correctly, so do the same in mark_reg_read() as well. It saves 0.2 seconds for the test below program before after bpf_lb-DLB_L3.o 2604 2304 bpf_lb-DLB_L4.o 11159 3723 bpf_lb-DUNKNOWN.o 1116 1110 bpf_lxc-DDROP_ALL.o 34566 28004 bpf_lxc-DUNKNOWN.o 53267 39026 bpf_netdev.o 17843 16943 bpf_overlay.o 8672 7929 time ~11 sec ~4 sec Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning") Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..8b8d6ba39e23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; + if (regno == BPF_REG_FP) + /* We don't need to worry about FP liveness because it's read-only */ + return; + while (parent) { /* if read wasn't screened by an earlier write ... */ if (state->regs[regno].live & REG_LIVE_WRITTEN) @@ -2345,6 +2349,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ regs[insn->dst_reg] = regs[insn->src_reg]; + regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { From 845e405e5e6c9dc9ed10306a4b5bfeaefebc2e84 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 3 Oct 2017 12:00:49 -0500 Subject: [PATCH 065/217] pinctrl: cherryview: fix issues caused by dynamic gpio irqs mapping New GPIO IRQs are allocated and mapped dynamically by default when GPIO IRQ infrastructure is used by cherryview-pinctrl driver. This causes issues on some Intel platforms [1][2] with broken BIOS which hardcodes Linux IRQ numbers in their ACPI tables. On such platforms cherryview-pinctrl driver should allocate and map all GPIO IRQs at probe time. Side effect - "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n" can be seen at boot log. NOTE. It still may fail if boot sequence will changed and some interrupt controller will be probed before cherryview-pinctrl which will shift Linux IRQ numbering (expected with CONFIG_SPARCE_IRQ enabled). [1] https://bugzilla.kernel.org/show_bug.cgi?id=194945 [2] https://lkml.org/lkml/2017/9/28/153 Cc: Andy Shevchenko Cc: Chris Gorman Cc: Mika Westerberg Cc: Heikki Krogerus Signed-off-by: Grygorii Strashko Reported-by: Chris Gorman Reported-by: Mika Westerberg Tested-by: Chris Gorman Acked-by: Mika Westerberg Signed-off-by: Linus Walleij --- drivers/pinctrl/intel/pinctrl-cherryview.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c index 04e929fd0ffe..fadbca907c7c 100644 --- a/drivers/pinctrl/intel/pinctrl-cherryview.c +++ b/drivers/pinctrl/intel/pinctrl-cherryview.c @@ -1577,6 +1577,7 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) struct gpio_chip *chip = &pctrl->chip; bool need_valid_mask = !dmi_check_system(chv_no_valid_mask); int ret, i, offset; + int irq_base; *chip = chv_gpio_chip; @@ -1622,7 +1623,18 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) /* Clear all interrupts */ chv_writel(0xffff, pctrl->regs + CHV_INTSTAT); - ret = gpiochip_irqchip_add(chip, &chv_gpio_irqchip, 0, + if (!need_valid_mask) { + irq_base = devm_irq_alloc_descs(pctrl->dev, -1, 0, + chip->ngpio, NUMA_NO_NODE); + if (irq_base < 0) { + dev_err(pctrl->dev, "Failed to allocate IRQ numbers\n"); + return irq_base; + } + } else { + irq_base = 0; + } + + ret = gpiochip_irqchip_add(chip, &chv_gpio_irqchip, irq_base, handle_bad_irq, IRQ_TYPE_NONE); if (ret) { dev_err(pctrl->dev, "failed to add IRQ chip\n"); From a69518cf0b4cbf02c6bc1239cdeb8750a9eb8077 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 8 Oct 2017 11:53:26 +0200 Subject: [PATCH 066/217] mlxsw: spectrum_router: Avoid expensive lookup during route removal In commit fc922bb0dd94 ("mlxsw: spectrum_router: Use one LPM tree for all virtual routers") I increased the scale of supported VRFs by having all of them share the same LPM tree. In order to avoid look-ups for prefix lengths that don't exist, each route removal would trigger an aggregation across all the active virtual routers to see which prefix lengths are in use and which aren't and structure the tree accordingly. With the way the data structures are currently laid out, this is a very expensive operation. When preformed repeatedly - due to the invocation of the abort mechanism - and with enough VRFs, this can result in a hung task. For now, avoid this optimization until it can be properly re-added in net-next. Fixes: fc922bb0dd94 ("mlxsw: spectrum_router: Use one LPM tree for all virtual routers") Signed-off-by: Ido Schimmel Reported-by: David Ahern Tested-by: David Ahern Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 032089efc1a0..c16718d296d3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3505,20 +3505,6 @@ static int mlxsw_sp_fib_lpm_tree_link(struct mlxsw_sp *mlxsw_sp, static void mlxsw_sp_fib_lpm_tree_unlink(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib *fib) { - struct mlxsw_sp_prefix_usage req_prefix_usage = {{ 0 } }; - struct mlxsw_sp_lpm_tree *lpm_tree; - - /* Aggregate prefix lengths across all virtual routers to make - * sure we only have used prefix lengths in the LPM tree. - */ - mlxsw_sp_vrs_prefixes(mlxsw_sp, fib->proto, &req_prefix_usage); - lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage, - fib->proto); - if (IS_ERR(lpm_tree)) - goto err_tree_get; - mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree); - -err_tree_get: if (!mlxsw_sp_prefix_usage_none(&fib->prefix_usage)) return; mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, fib); From 3d0241d57c7b25bb75ac9d7a62753642264fdbce Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 6 Oct 2017 19:02:35 +0300 Subject: [PATCH 067/217] gso: fix payload length when gso_size is zero When gso_size reset to zero for the tail segment in skb_segment(), later in ipv6_gso_segment(), __skb_udp_tunnel_segment() and gre_gso_segment() we will get incorrect results (payload length, pcsum) for that segment. inet_gso_segment() already has a check for gso_size before calculating payload. The issue was found with LTP vxlan & gre tests over ixgbe NIC. Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer") Signed-off-by: Alexey Kodanev Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/ip6_offload.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 416bb304a281..1859c473b21a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 97658bfc1b58..e360d55be555 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cdb3728faca7..4a87f9428ca5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (gso_partial) + if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); From 3382605fd8db1ed1fb03f3f1529490133fe3ab08 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 7 Oct 2017 14:32:49 +0200 Subject: [PATCH 068/217] tipc: correct initialization of skb list We change the initialization of the skb transmit buffer queues in the functions tipc_bcast_xmit() and tipc_rcast_xmit() to also initialize their spinlocks. This is needed because we may, during error conditions, need to call skb_queue_purge() on those queues further down the stack. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 7d99029df342..a140dd4a84af 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -233,7 +233,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, struct sk_buff_head xmitq; int rc = 0; - __skb_queue_head_init(&xmitq); + skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); @@ -263,7 +263,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, u32 dst, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); - __skb_queue_head_init(&_pkts); + skb_queue_head_init(&_pkts); list_for_each_entry_safe(n, tmp, &dests->list, list) { dst = n->value; From a9e2971b8cd3ef469de0112ba15778b5b98ad72e Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 7 Oct 2017 15:07:20 +0200 Subject: [PATCH 069/217] tipc: Unclone message at secondary destination lookup When a bundling message is received, the function tipc_link_input() calls function tipc_msg_extract() to unbundle all inner messages of the bundling message before adding them to input queue. The function tipc_msg_extract() just clones all inner skb for all inner messagges from the bundling skb. This means that the skb headroom of an inner message overlaps with the data part of the preceding message in the bundle. If the message in question is a name addressed message, it may be subject to a secondary destination lookup, and eventually be sent out on one of the interfaces again. But, since what is perceived as headroom by the device driver in reality is the last bytes of the preceding message in the bundle, the latter will be overwritten by the MAC addresses of the L2 header. If the preceding message has not yet been consumed by the user, it will evenually be delivered with corrupted contents. This commit fixes this by uncloning all messages passing through the function tipc_msg_lookup_dest(), hence ensuring that the headroom is always valid when the message is passed on. Signed-off-by: Tung Nguyen Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 121e59a1d0e7..17146c16ee2d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -568,6 +568,14 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg_set_destnode(msg, dnode); msg_set_destport(msg, dport); *err = TIPC_OK; + + if (!skb_cloned(skb)) + return true; + + /* Unclone buffer in case it was bundled */ + if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) + return false; + return true; } From bd998c2e0df0469707503023d50d46cf0b10c787 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 4 Oct 2017 11:01:12 +0200 Subject: [PATCH 070/217] USB: serial: console: fix use-after-free on disconnect A clean-up patch removing two redundant NULL-checks from the console disconnect handler inadvertently also removed a third check. This could lead to the struct usb_serial being prematurely freed by the console code when a driver accepts but does not register any ports for an interface which also lacks endpoint descriptors. Fixes: 0e517c93dc02 ("USB: serial: console: clean up sanity checks") Cc: stable # 4.11 Reported-by: Andrey Konovalov Acked-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index fdf89800ebc3..ed8ba3ef5c79 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -265,7 +265,7 @@ static struct console usbcons = { void usb_serial_console_disconnect(struct usb_serial *serial) { - if (serial->port[0] == usbcons_info.port) { + if (serial->port[0] && serial->port[0] == usbcons_info.port) { usb_serial_console_exit(); usb_serial_put(serial); } From 299d7572e46f98534033a9e65973f13ad1ce9047 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 4 Oct 2017 11:01:13 +0200 Subject: [PATCH 071/217] USB: serial: console: fix use-after-free after failed setup Make sure to reset the USB-console port pointer when console setup fails in order to avoid having the struct usb_serial be prematurely freed by the console code when the device is later disconnected. Fixes: 73e487fdb75f ("[PATCH] USB console: fix disconnection issues") Cc: stable # 2.6.18 Acked-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/console.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index ed8ba3ef5c79..43a862a90a77 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -186,6 +186,7 @@ static int usb_console_setup(struct console *co, char *options) tty_kref_put(tty); reset_open_count: port->port.count = 0; + info->port = NULL; usb_autopm_put_interface(serial->interface); error_get_interface: usb_serial_put(serial); From 49f817d793d1bcc11d721881aac037b996feef5c Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 00:44:03 +0800 Subject: [PATCH 072/217] netfilter: SYNPROXY: skip non-tcp packet in {ipv4, ipv6}_synproxy_hook In function {ipv4,ipv6}_synproxy_hook we expect a normal tcp packet, but the real server maybe reply an icmp error packet related to the exist tcp conntrack, so we will access wrong tcp data. Fix it by checking for the protocol field and only process tcp traffic. Signed-off-by: Lin Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_SYNPROXY.c | 3 ++- net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 811689e523c3..f75fc6b53115 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (synproxy == NULL) return NF_ACCEPT; - if (nf_is_loopback_packet(skb)) + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) return NF_ACCEPT; thoff = ip_hdrlen(skb); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index a5cd43d75393..437af8c95277 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, nexthdr = ipv6_hdr(skb)->nexthdr; thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (thoff < 0) + if (thoff < 0 || nexthdr != IPPROTO_TCP) return NF_ACCEPT; th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); From 19e1d4e947cac3b5e08225d15ad7744e691c7376 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Oct 2017 12:41:36 +0200 Subject: [PATCH 073/217] genirq: Warn when effective affinity is not updated Emit a one time warning when the effective affinity mask is enabled in Kconfig, but the interrupt chip does not update the mask in its irq_set_affinity() callback, Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/manage.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d00132b5c325..ef89f7246656 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +static void irq_validate_effective_affinity(struct irq_data *data) +{ +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + const struct cpumask *m = irq_data_get_effective_affinity_mask(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + + if (!cpumask_empty(m)) + return; + pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", + chip->name, data->irq); +#endif +} + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -181,6 +194,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: + irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } From 60b09c51bb4fb46e2331fdbb39f91520f31d35f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Oct 2017 12:47:24 +0200 Subject: [PATCH 074/217] genirq/cpuhotplug: Add sanity check for effective affinity mask The effective affinity mask handling has no safety net when the mask is not updated by the interrupt chip or the mask contains offline CPUs. If that happens the CPU unplug code fails to migrate interrupts. Add sanity checks and emit a warning when the mask contains only offline CPUs. Fixes: 415fcf1a2293 ("genirq/cpuhotplug: Use effective affinity mask") Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Christoph Hellwig Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/cpuhotplug.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 638eb9c83d9f..9eb09aef0313 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,8 +18,34 @@ static inline bool irq_needs_fixup(struct irq_data *d) { const struct cpumask *m = irq_data_get_effective_affinity_mask(d); + unsigned int cpu = smp_processor_id(); - return cpumask_test_cpu(smp_processor_id(), m); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + /* + * The cpumask_empty() check is a workaround for interrupt chips, + * which do not implement effective affinity, but the architecture has + * enabled the config switch. Use the general affinity mask instead. + */ + if (cpumask_empty(m)) + m = irq_data_get_affinity_mask(d); + + /* + * Sanity check. If the mask is not empty when excluding the outgoing + * CPU then it must contain at least one online CPU. The outgoing CPU + * has been removed from the online mask already. + */ + if (cpumask_any_but(m, cpu) < nr_cpu_ids && + cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { + /* + * If this happens then there was a missed IRQ fixup at some + * point. Warn about it and enforce fixup. + */ + pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n", + cpumask_pr_args(m), d->irq, cpu); + return true; + } +#endif + return cpumask_test_cpu(cpu, m); } static bool migrate_one_irq(struct irq_desc *desc) From e43b3b58548051f8809391eb7bec7a27ed3003ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Oct 2017 21:07:38 +0200 Subject: [PATCH 075/217] genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs Managed interrupts can end up in a stale state on CPU hotplug. If the interrupt is not targeting a single CPU, i.e. the affinity mask spawns multiple CPUs then the following can happen: After boot: dstate: 0x01601200 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 0 After offlining CPU 31 - 24 dstate: 0x01a31000 IRQD_IRQ_DISABLED IRQD_IRQ_MASKED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_AFFINITY_MANAGED IRQD_MANAGED_SHUTDOWN node: 0 affinity: 24-31 effectiv: 24 pending: 0 Now CPU 25 gets onlined again, so it should get the effective interrupt affinity for this interruopt, but due to the x86 interrupt affinity setter restrictions this ends up after restarting the interrupt with: dstate: 0x01601300 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_SETAFFINITY_PENDING IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 24-31 So the interrupt is still affine to CPU 24, which was the last CPU to go offline of that affinity set and the move to an online CPU within 24-31, in this case 25, is pending. This mechanism is x86/ia64 specific as those architectures cannot move interrupts from thread context and do this when an interrupt is actually handled. So the move is set to pending. Whats worse is that offlining CPU 25 again results in: dstate: 0x01601300 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_SETAFFINITY_PENDING IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 24-31 This means the interrupt has not been shut down, because the outgoing CPU is not in the effective affinity mask, but of course nothing notices that the effective affinity mask is pointing at an offline CPU. In the case of restarting a managed interrupt the move restriction does not apply, so the affinity setting can be made unconditional. This needs to be done _before_ the interrupt is started up as otherwise the condition for moving it from thread context would not longer be fulfilled. With that change applied onlining CPU 25 after offlining 31-24 results in: dstate: 0x01600200 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 25 pending: And after offlining CPU 25: dstate: 0x01a30000 IRQD_IRQ_DISABLED IRQD_IRQ_MASKED IRQD_SINGLE_TARGET IRQD_AFFINITY_MANAGED IRQD_MANAGED_SHUTDOWN node: 0 affinity: 24-31 effectiv: 25 pending: which is the correct and expected result. Fixes: 761ea388e8c4 ("genirq: Handle managed irqs gracefully in irq_startup()") Reported-by: YASUAKI ISHIMATSU Signed-off-by: Thomas Gleixner Cc: axboe@kernel.dk Cc: linux-scsi@vger.kernel.org Cc: Sumit Saxena Cc: Marc Zyngier Cc: mpe@ellerman.id.au Cc: Shivasharan Srikanteshwara Cc: Kashyap Desai Cc: keith.busch@intel.com Cc: peterz@infradead.org Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/chip.c | 2 +- kernel/irq/manage.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6fc89fd93824..5a2ef92c2782 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: + irq_do_set_affinity(d, aff, false); ret = __irq_startup(desc); - irq_set_affinity_locked(d, aff, false); break; case IRQ_STARTUP_ABORT: return 0; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef89f7246656..4bff6a10ae8e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -188,6 +188,9 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_chip *chip = irq_data_get_irq_chip(data); int ret; + if (!chip || !chip->irq_set_affinity) + return -EINVAL; + ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: From 924c6b900cfdf376b07bccfd80e62b21914f8a5a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 8 Oct 2017 21:53:05 -0700 Subject: [PATCH 076/217] x86/mm/64: Fix reboot interaction with CR4.PCIDE Trying to reboot via real mode fails with PCID on: long mode cannot be exited while CR4.PCIDE is set. (No, I have no idea why, but the SDM and actual CPUs are in agreement here.) The result is a GPF and a hang instead of a reboot. I didn't catch this in testing because neither my computer nor my VM reboots this way. I can trigger it with reboot=bios, though. Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems") Reported-and-tested-by: Steven Rostedt (VMware) Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Link: https://lkml.kernel.org/r/f1e7d965998018450a7a70c2823873686a8b21c0.1507524746.git.luto@kernel.org --- arch/x86/kernel/reboot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 54180fa6f66f..add33f600531 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type) load_cr3(initial_page_table); #else write_cr3(real_mode_header->trampoline_pgd); + + /* Exiting long mode will fail if CR4.PCIDE is set. */ + if (static_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); #endif /* Jump to the identity-mapped low memory code */ From 6b32c126d33d5cb379bca280ab8acedc1ca978ff Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 Oct 2017 20:30:12 +0200 Subject: [PATCH 077/217] x86/alternatives: Fix alt_max_short macro to really be a max() The alt_max_short() macro in asm/alternative.h does not work as intended, leading to nasty bugs. E.g. alt_max_short("1", "3") evaluates to 3, but alt_max_short("3", "1") evaluates to 1 -- not exactly the maximum of 1 and 3. In fact, I had to learn it the hard way by crashing my kernel in not so funny ways by attempting to make use of the ALTENATIVE_2 macro with alternatives where the first one was larger than the second one. According to [1] and commit dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") the right handed side should read "-(-(a < b))" not "-(-(a - b))". Fix that, to make the macro work as intended. While at it, fix up the comments regarding the additional "-", too. It's not about gas' usage of s32 but brain dead logic of having a "true" value of -1 for the < operator ... *sigh* Btw., the one in asm/alternative-asm.h is correct. And, apparently, all current users of ALTERNATIVE_2() pass same sized alternatives, avoiding to hit the bug. [1] http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax Reviewed-and-tested-by: Borislav Petkov Fixes: dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") Signed-off-by: Mathias Krause Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1507228213-13095-1-git-send-email-minipli@googlemail.com --- arch/x86/include/asm/alternative-asm.h | 4 +++- arch/x86/include/asm/alternative.h | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e7636bac7372..6c98821fef5e 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -62,8 +62,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index c096624137ae..ccbe24e697c4 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end) alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is From c247487c0dd6fefff6ed0cbcbe66f037721755fb Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 6 Oct 2017 02:04:06 +0800 Subject: [PATCH 078/217] ALSA: usb-audio: Add sample rate quirk for Plantronics P610 Like other Plantronics devices, P610 does not support sample rate reading. Apply sample rate quirk to it. BugLink: https://bugs.launchpad.net/bugs/1719853 Signed-off-by: Kai-Heng Feng Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index b8cb57aeec77..9ddaae3784f5 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1138,6 +1138,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */ case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ case USB_ID(0x047F, 0xC022): /* Plantronics C310 */ + case USB_ID(0x047F, 0xC02F): /* Plantronics P610 */ case USB_ID(0x047F, 0xC036): /* Plantronics C520-M */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ From 5803b023881857db32ffefa0d269c90280a67ee0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 10:02:56 +0200 Subject: [PATCH 079/217] ALSA: seq: Fix copy_from_user() call inside lock The event handler in the virmidi sequencer code takes a read-lock for the linked list traverse, while it's calling snd_seq_dump_var_event() in the loop. The latter function may expand the user-space data depending on the event type. It eventually invokes copy_from_user(), which might be a potential dead-lock. The sequencer core guarantees that the user-space data is passed only with atomic=0 argument, but snd_virmidi_dev_receive_event() ignores it and always takes read-lock(). For avoiding the problem above, this patch introduces rwsem for non-atomic case, while keeping rwlock for atomic case. Also while we're at it: the superfluous irq flags is dropped in snd_virmidi_input_open(). Reported-by: Jia-Ju Bai Cc: Signed-off-by: Takashi Iwai --- include/sound/seq_virmidi.h | 1 + sound/core/seq/seq_virmidi.c | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/include/sound/seq_virmidi.h b/include/sound/seq_virmidi.h index a03acd0d398a..695257ae64ac 100644 --- a/include/sound/seq_virmidi.h +++ b/include/sound/seq_virmidi.h @@ -60,6 +60,7 @@ struct snd_virmidi_dev { int port; /* created/attached port */ unsigned int flags; /* SNDRV_VIRMIDI_* */ rwlock_t filelist_lock; + struct rw_semaphore filelist_sem; struct list_head filelist; }; diff --git a/sound/core/seq/seq_virmidi.c b/sound/core/seq/seq_virmidi.c index 8d93a4021c78..f48a4cd24ffc 100644 --- a/sound/core/seq/seq_virmidi.c +++ b/sound/core/seq/seq_virmidi.c @@ -77,13 +77,17 @@ static void snd_virmidi_init_event(struct snd_virmidi *vmidi, * decode input event and put to read buffer of each opened file */ static int snd_virmidi_dev_receive_event(struct snd_virmidi_dev *rdev, - struct snd_seq_event *ev) + struct snd_seq_event *ev, + bool atomic) { struct snd_virmidi *vmidi; unsigned char msg[4]; int len; - read_lock(&rdev->filelist_lock); + if (atomic) + read_lock(&rdev->filelist_lock); + else + down_read(&rdev->filelist_sem); list_for_each_entry(vmidi, &rdev->filelist, list) { if (!vmidi->trigger) continue; @@ -97,7 +101,10 @@ static int snd_virmidi_dev_receive_event(struct snd_virmidi_dev *rdev, snd_rawmidi_receive(vmidi->substream, msg, len); } } - read_unlock(&rdev->filelist_lock); + if (atomic) + read_unlock(&rdev->filelist_lock); + else + up_read(&rdev->filelist_sem); return 0; } @@ -115,7 +122,7 @@ int snd_virmidi_receive(struct snd_rawmidi *rmidi, struct snd_seq_event *ev) struct snd_virmidi_dev *rdev; rdev = rmidi->private_data; - return snd_virmidi_dev_receive_event(rdev, ev); + return snd_virmidi_dev_receive_event(rdev, ev, true); } #endif /* 0 */ @@ -130,7 +137,7 @@ static int snd_virmidi_event_input(struct snd_seq_event *ev, int direct, rdev = private_data; if (!(rdev->flags & SNDRV_VIRMIDI_USE)) return 0; /* ignored */ - return snd_virmidi_dev_receive_event(rdev, ev); + return snd_virmidi_dev_receive_event(rdev, ev, atomic); } /* @@ -209,7 +216,6 @@ static int snd_virmidi_input_open(struct snd_rawmidi_substream *substream) struct snd_virmidi_dev *rdev = substream->rmidi->private_data; struct snd_rawmidi_runtime *runtime = substream->runtime; struct snd_virmidi *vmidi; - unsigned long flags; vmidi = kzalloc(sizeof(*vmidi), GFP_KERNEL); if (vmidi == NULL) @@ -223,9 +229,11 @@ static int snd_virmidi_input_open(struct snd_rawmidi_substream *substream) vmidi->client = rdev->client; vmidi->port = rdev->port; runtime->private_data = vmidi; - write_lock_irqsave(&rdev->filelist_lock, flags); + down_write(&rdev->filelist_sem); + write_lock_irq(&rdev->filelist_lock); list_add_tail(&vmidi->list, &rdev->filelist); - write_unlock_irqrestore(&rdev->filelist_lock, flags); + write_unlock_irq(&rdev->filelist_lock); + up_write(&rdev->filelist_sem); vmidi->rdev = rdev; return 0; } @@ -264,9 +272,11 @@ static int snd_virmidi_input_close(struct snd_rawmidi_substream *substream) struct snd_virmidi_dev *rdev = substream->rmidi->private_data; struct snd_virmidi *vmidi = substream->runtime->private_data; + down_write(&rdev->filelist_sem); write_lock_irq(&rdev->filelist_lock); list_del(&vmidi->list); write_unlock_irq(&rdev->filelist_lock); + up_write(&rdev->filelist_sem); snd_midi_event_free(vmidi->parser); substream->runtime->private_data = NULL; kfree(vmidi); @@ -520,6 +530,7 @@ int snd_virmidi_new(struct snd_card *card, int device, struct snd_rawmidi **rrmi rdev->rmidi = rmidi; rdev->device = device; rdev->client = -1; + init_rwsem(&rdev->filelist_sem); rwlock_init(&rdev->filelist_lock); INIT_LIST_HEAD(&rdev->filelist); rdev->seq_mode = SNDRV_VIRMIDI_SEQ_DISPATCH; From 78279127253a6c36ed8829eb2b7bc28ef48d9717 Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Mon, 9 Oct 2017 14:46:41 +0800 Subject: [PATCH 080/217] drm/atomic: Unref duplicated drm_atomic_state in drm_atomic_helper_resume() Kmemleak reported memory leak after suspend and resume: unreferenced object 0xffffffc0e31d8880 (size 128): comm "bash", pid 181, jiffies 4294763583 (age 24.694s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 20 a2 eb c0 ff ff ff ......... ...... 01 00 00 00 00 00 00 00 80 87 1d e3 c0 ff ff ff ................ backtrace: [] __save_stack_trace+0x48/0x6c [] create_object+0x138/0x254 [] kmemleak_alloc+0x58/0x8c [] kmem_cache_alloc_trace+0x188/0x254 [] drm_atomic_state_alloc+0x3c/0x88 [] drm_atomic_helper_duplicate_state+0x28/0x158 [] drm_atomic_helper_suspend+0x5c/0xf0 Problem here is that we are duplicating the drm_atomic_state in drm_atomic_helper_suspend(), but not unreference it in the resume path. Fixes: 1494276000db ("drm/atomic-helper: Implement subsystem-level suspend/resume") Signed-off-by: Jeffy Chen Reviewed-by: Maarten Lankhorst Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20171009064641.15174-1-jeffy.chen@rock-chips.com Fixes: 0853695c3ba4 ("drm: Add reference counting to drm_atomic_state") Cc: # v4.10+ (cherry picked from commit 6d281b1f79e194c02125da29ea77316810261ca8) --- drivers/gpu/drm/drm_atomic_helper.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 4e53aae9a1fb..0028591f3f95 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -2960,6 +2960,7 @@ int drm_atomic_helper_resume(struct drm_device *dev, drm_modeset_backoff(&ctx); } + drm_atomic_state_put(state); drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); From 94c3390ab84a6b449accc7351ffda4a0c17bdb92 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Wed, 27 Sep 2017 09:14:58 +0100 Subject: [PATCH 081/217] MIPS: bpf: Fix uninitialised target compiler error Compiling ebpf_jit.c with gcc 4.9 results in a (likely spurious) compiler warning, as gcc has detected that the variable "target" may be used uninitialised. Since -Werror is active, this is treated as an error and causes a kernel build failure whenever CONFIG_MIPS_EBPF_JIT is enabled. arch/mips/net/ebpf_jit.c: In function 'build_one_insn': arch/mips/net/ebpf_jit.c:1118:80: error: 'target' may be used uninitialized in this function [-Werror=maybe-uninitialized] emit_instr(ctx, j, target); ^ cc1: all warnings being treated as errors Fix this by initialising "target" to 0. If it really is used uninitialised this would result in a jump to 0 and a detectable run time failure. Signed-off-by: Matt Redfearn Fixes: b6bd53f9c4e8 ("MIPS: Add missing file for eBPF JIT.") Cc: James Hogan Cc: David Daney Cc: David S. Miller Cc: Colin Ian King Cc: Daniel Borkmann Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: # v4.13+ Patchwork: https://patchwork.linux-mips.org/patch/17375/ Signed-off-by: Ralf Baechle --- arch/mips/net/ebpf_jit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 7646891c4e9b..01b7a87ea678 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -667,7 +667,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, { int src, dst, r, td, ts, mem_off, b_off; bool need_swap, did_move, cmp_eq; - unsigned int target; + unsigned int target = 0; u64 t64; s64 t64s; int bpf_op = BPF_OP(insn->code); From 1b6ad6df8b4fd723d8d98b670b0d7772402e4e34 Mon Sep 17 00:00:00 2001 From: Kelvin Cheung Date: Fri, 6 Oct 2017 21:13:18 +0800 Subject: [PATCH 082/217] MIPS: loongson1: set default number of rx and tx queues for stmmac Set the default number of RX and TX queues due to the recent changes of stmmac driver. Otherwise the ethernet will crash once it starts. Signed-off-by: Kelvin Cheung Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17452/ Signed-off-by: Ralf Baechle --- arch/mips/loongson32/common/platform.c | 38 ++++++++++++++------------ 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/mips/loongson32/common/platform.c b/arch/mips/loongson32/common/platform.c index 100f23dfa438..ac584c5823d0 100644 --- a/arch/mips/loongson32/common/platform.c +++ b/arch/mips/loongson32/common/platform.c @@ -183,18 +183,20 @@ int ls1x_eth_mux_init(struct platform_device *pdev, void *priv) } static struct plat_stmmacenet_data ls1x_eth0_pdata = { - .bus_id = 0, - .phy_addr = -1, + .bus_id = 0, + .phy_addr = -1, #if defined(CONFIG_LOONGSON1_LS1B) - .interface = PHY_INTERFACE_MODE_MII, + .interface = PHY_INTERFACE_MODE_MII, #elif defined(CONFIG_LOONGSON1_LS1C) - .interface = PHY_INTERFACE_MODE_RMII, + .interface = PHY_INTERFACE_MODE_RMII, #endif - .mdio_bus_data = &ls1x_mdio_bus_data, - .dma_cfg = &ls1x_eth_dma_cfg, - .has_gmac = 1, - .tx_coe = 1, - .init = ls1x_eth_mux_init, + .mdio_bus_data = &ls1x_mdio_bus_data, + .dma_cfg = &ls1x_eth_dma_cfg, + .has_gmac = 1, + .tx_coe = 1, + .rx_queues_to_use = 1, + .tx_queues_to_use = 1, + .init = ls1x_eth_mux_init, }; static struct resource ls1x_eth0_resources[] = { @@ -222,14 +224,16 @@ struct platform_device ls1x_eth0_pdev = { #ifdef CONFIG_LOONGSON1_LS1B static struct plat_stmmacenet_data ls1x_eth1_pdata = { - .bus_id = 1, - .phy_addr = -1, - .interface = PHY_INTERFACE_MODE_MII, - .mdio_bus_data = &ls1x_mdio_bus_data, - .dma_cfg = &ls1x_eth_dma_cfg, - .has_gmac = 1, - .tx_coe = 1, - .init = ls1x_eth_mux_init, + .bus_id = 1, + .phy_addr = -1, + .interface = PHY_INTERFACE_MODE_MII, + .mdio_bus_data = &ls1x_mdio_bus_data, + .dma_cfg = &ls1x_eth_dma_cfg, + .has_gmac = 1, + .tx_coe = 1, + .rx_queues_to_use = 1, + .tx_queues_to_use = 1, + .init = ls1x_eth_mux_init, }; static struct resource ls1x_eth1_resources[] = { From 98589a0998b8b13c4a8fa1ccb0e62751a019faa5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 9 Oct 2017 15:27:15 +0300 Subject: [PATCH 083/217] netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1' Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced support for attaching an eBPF object by an fd, with the 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each IPT_SO_SET_REPLACE call. However this breaks subsequent iptables calls: # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT # iptables -A INPUT -s 5.6.7.8 -j ACCEPT iptables: Invalid argument. Run `dmesg' for more information. That's because iptables works by loading existing rules using IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with the replacement set. However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number (from the initial "iptables -m bpf" invocation) - so when 2nd invocation occurs, userspace passes a bogus fd number, which leads to 'bpf_mt_check_v1' to fail. One suggested solution [1] was to hack iptables userspace, to perform a "entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new, process-local fd per every 'xt_bpf_info_v1' entry seen. However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects. This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given '.fd' and instead perform an in-kernel lookup for the bpf object given the provided '.path'. It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is expected to provide the path of the pinned object. Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved. References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2 [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2 Reported-by: Rafael Buchbinder Signed-off-by: Shmulik Ladkani Acked-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/linux/bpf.h | 5 +++++ include/uapi/linux/netfilter/xt_bpf.h | 1 + kernel/bpf/inode.c | 1 + net/netfilter/xt_bpf.c | 22 ++++++++++++++++++++-- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8390859e79e7..f1af7d63d678 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -368,6 +368,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } +static inline int bpf_obj_get_user(const char __user *pathname) +{ + return -EOPNOTSUPP; +} + static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h index b97725af2ac0..da161b56c79e 100644 --- a/include/uapi/linux/netfilter/xt_bpf.h +++ b/include/uapi/linux/netfilter/xt_bpf.h @@ -23,6 +23,7 @@ enum xt_bpf_modes { XT_BPF_MODE_FD_PINNED, XT_BPF_MODE_FD_ELF, }; +#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED struct xt_bpf_info_v1 { __u16 mode; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e833ed914358..be1dde967208 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -363,6 +363,7 @@ int bpf_obj_get_user(const char __user *pathname) putname(pname); return ret; } +EXPORT_SYMBOL_GPL(bpf_obj_get_user); static void bpf_evict_inode(struct inode *inode) { diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 38986a95216c..29123934887b 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) return 0; } +static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) +{ + mm_segment_t oldfs = get_fs(); + int retval, fd; + + set_fs(KERNEL_DS); + fd = bpf_obj_get_user(path); + set_fs(oldfs); + if (fd < 0) + return fd; + + retval = __bpf_mt_check_fd(fd, ret); + sys_close(fd); + return retval; +} + static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; @@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par) return __bpf_mt_check_bytecode(info->bpf_program, info->bpf_program_num_elem, &info->filter); - else if (info->mode == XT_BPF_MODE_FD_PINNED || - info->mode == XT_BPF_MODE_FD_ELF) + else if (info->mode == XT_BPF_MODE_FD_ELF) return __bpf_mt_check_fd(info->fd, &info->filter); + else if (info->mode == XT_BPF_MODE_PATH_PINNED) + return __bpf_mt_check_path(info->path, &info->filter); else return -EINVAL; } From cb02ffc76a53b5ea751b79b8d4f4d180e5868475 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 14:32:15 +0200 Subject: [PATCH 084/217] ALSA: line6: Fix missing initialization before error path The error path in podhd_init() tries to clear the pending timer, while the timer object is initialized at the end of init sequence, thus it may hit the uninitialized object, as spotted by syzkaller: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 1 PID: 1845 Comm: kworker/1:2 Not tainted 4.14.0-rc2-42613-g1488251d1a98 #238 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 register_lock_class+0x6c4/0x1a00 kernel/locking/lockdep.c:769 __lock_acquire+0x27e/0x4550 kernel/locking/lockdep.c:3385 lock_acquire+0x259/0x620 kernel/locking/lockdep.c:4002 del_timer_sync+0x12c/0x280 kernel/time/timer.c:1237 podhd_disconnect+0x8c/0x160 sound/usb/line6/podhd.c:299 line6_probe+0x844/0x1310 sound/usb/line6/driver.c:783 podhd_probe+0x64/0x70 sound/usb/line6/podhd.c:474 .... For addressing it, assure the initializations of timer and work by moving them to the beginning of podhd_init(). Fixes: 790869dacc3d ("ALSA: line6: Add support for POD X3") Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/line6/podhd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/usb/line6/podhd.c b/sound/usb/line6/podhd.c index 956f847a96e4..14ab82ea7e82 100644 --- a/sound/usb/line6/podhd.c +++ b/sound/usb/line6/podhd.c @@ -317,6 +317,9 @@ static int podhd_init(struct usb_line6 *line6, line6->disconnect = podhd_disconnect; + init_timer(&pod->startup_timer); + INIT_WORK(&pod->startup_work, podhd_startup_workqueue); + if (pod->line6.properties->capabilities & LINE6_CAP_CONTROL) { /* claim the data interface */ intf = usb_ifnum_to_if(line6->usbdev, @@ -358,8 +361,6 @@ static int podhd_init(struct usb_line6 *line6, } /* init device and delay registering */ - init_timer(&pod->startup_timer); - INIT_WORK(&pod->startup_work, podhd_startup_workqueue); podhd_startup(pod); return 0; } From 54a4b2b45817ea2365b40c923c098a26af0c0dbb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 14:26:27 +0200 Subject: [PATCH 085/217] ALSA: line6: Fix NULL dereference at podhd_disconnect() When podhd_init() failed with the acquiring a ctrl i/f, the line6 helper still calls the disconnect callback that eventually calls again usb_driver_release_interface() with the NULL intf. Put the proper NULL check before calling it for avoiding an Oops. Fixes: fc90172ba283 ("ALSA: line6: Claim pod x3 usb data interface") Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/line6/podhd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/usb/line6/podhd.c b/sound/usb/line6/podhd.c index 14ab82ea7e82..451007c27743 100644 --- a/sound/usb/line6/podhd.c +++ b/sound/usb/line6/podhd.c @@ -301,7 +301,8 @@ static void podhd_disconnect(struct usb_line6 *line6) intf = usb_ifnum_to_if(line6->usbdev, pod->line6.properties->ctrl_if); - usb_driver_release_interface(&podhd_driver, intf); + if (intf) + usb_driver_release_interface(&podhd_driver, intf); } } From c95072b3d88fac4be295815f2b67df366c0c297f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 14:51:23 +0200 Subject: [PATCH 086/217] ALSA: line6: Fix leftover URB at error-path during probe While line6_probe() may kick off URB for a control MIDI endpoint, the function doesn't clean up it properly at its error path. This results in a leftover URB action that is eventually triggered later and causes an Oops like: general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 0 Comm: swapper/1 Not tainted RIP: 0010:usb_fill_bulk_urb ./include/linux/usb.h:1619 RIP: 0010:line6_start_listen+0x3fe/0x9e0 sound/usb/line6/driver.c:76 Call Trace: line6_data_received+0x1f7/0x470 sound/usb/line6/driver.c:326 __usb_hcd_giveback_urb+0x2e0/0x650 drivers/usb/core/hcd.c:1779 usb_hcd_giveback_urb+0x337/0x420 drivers/usb/core/hcd.c:1845 dummy_timer+0xba9/0x39f0 drivers/usb/gadget/udc/dummy_hcd.c:1965 call_timer_fn+0x2a2/0x940 kernel/time/timer.c:1281 .... Since the whole clean-up procedure is done in line6_disconnect() callback, we can simply call it in the error path instead of open-coding the whole again. It'll fix such an issue automagically. The bug was spotted by syzkaller. Fixes: eedd0e95d355 ("ALSA: line6: Don't forget to call driver's destructor at error path") Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/line6/driver.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index 0ff5a7d2e19f..c8f723c3a033 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -779,9 +779,10 @@ int line6_probe(struct usb_interface *interface, return 0; error: - if (line6->disconnect) - line6->disconnect(line6); - snd_card_free(card); + /* we can call disconnect callback here because no close-sync is + * needed yet at this point + */ + line6_disconnect(interface); return ret; } EXPORT_SYMBOL_GPL(line6_probe); From 133d68e0ed4d822ccfa276ff7d2d1753477de1a8 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 1 Sep 2017 14:46:50 -0700 Subject: [PATCH 087/217] MIPS: Fix cmpxchg on 32b signed ints for 64b kernel with !kernel_uses_llsc Commit 8263db4d7768 ("MIPS: cmpxchg: Implement __cmpxchg() as a function") refactored our implementation of __cmpxchg() to be a function rather than a macro, with the aim of making it easier to read & modify. Unfortunately the commit breaks use of cmpxchg() for signed 32 bit values when we have a 64 bit kernel with kernel_uses_llsc == false, because: - In cmpxchg_local() we cast the old value to the type the pointer points to, and then to an unsigned long. If the pointer points to a signed type smaller than 64 bits then the old value will be sign extended to 64 bits. That is, bits beyond the size of the pointed to type will be set to 1 if the old value is negative. In the case of a signed 32 bit integer with a negative value, bits 63:32 will all be set. - In __cmpxchg_asm() we load the value from memory, ie. dereference the pointer, and store the value as an unsigned integer (__ret) whose size matches the pointer. For a 32 bit cmpxchg() this means we store the value in a u32, because the pointer provided to __cmpxchg_asm() by __cmpxchg() is of type volatile u32 *. - __cmpxchg_asm() then checks whether the value in memory (__ret) matches the provided old value, by comparing the two values. This results in the u32 being promoted to a 64 bit unsigned long to match the old argument - however because both types are unsigned the value is zero extended, which does not match the sign extension performed on the old value in cmpxchg_local() earlier. This mismatch means that unfortunate cmpxchg() calls can incorrectly fail for 64 bit kernels with kernel_uses_llsc == false. This is the case on at least non-SMP Cavium Octeon kernels, which hardcode kernel_uses_llsc in their cpu-feature-overrides.h header. Using a v4.13-rc7 kernel configured using cavium_octeon_defconfig with SMP manually disabled, this presents itself as oddity when we reach userland - for example: can't run '/bin/mount': Text file busy can't run '/bin/mkdir': Text file busy can't run '/bin/mkdir': Text file busy can't run '/bin/mount': Text file busy can't run '/bin/hostname': Text file busy can't run '/etc/init.d/rcS': Text file busy can't run '/sbin/getty': Text file busy can't run '/sbin/getty': Text file busy It appears that some part of the init process, which is in this case buildroot's busybox init, is running successfully. It never manages to reach the login prompt though, and complains about /sbin/getty being busy repeatedly and indefinitely. Fix this by casting the old value provided to __cmpxchg_asm() to an appropriately sized unsigned integer, such that we consistently zero-extend avoiding the mismatch. The __cmpxchg_small() case for 8 & 16 bit values is unaffected because __cmpxchg_small() already masks provided values appropriately. Signed-off-by: Paul Burton Fixes: 8263db4d7768 ("MIPS: cmpxchg: Implement __cmpxchg() as a function") Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17226/ Cc: linux-mips@linux-mips.org Signed-off-by: Ralf Baechle --- arch/mips/include/asm/cmpxchg.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h index 903f3bf48419..7e25c5cc353a 100644 --- a/arch/mips/include/asm/cmpxchg.h +++ b/arch/mips/include/asm/cmpxchg.h @@ -155,14 +155,16 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, return __cmpxchg_small(ptr, old, new, size); case 4: - return __cmpxchg_asm("ll", "sc", (volatile u32 *)ptr, old, new); + return __cmpxchg_asm("ll", "sc", (volatile u32 *)ptr, + (u32)old, new); case 8: /* lld/scd are only available for MIPS64 */ if (!IS_ENABLED(CONFIG_64BIT)) return __cmpxchg_called_with_bad_pointer(); - return __cmpxchg_asm("lld", "scd", (volatile u64 *)ptr, old, new); + return __cmpxchg_asm("lld", "scd", (volatile u64 *)ptr, + (u64)old, new); default: return __cmpxchg_called_with_bad_pointer(); From e1270575fb7ee7ed6058d4ad3714a3b28001a295 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sun, 3 Sep 2017 10:24:58 -0700 Subject: [PATCH 088/217] MIPS: Fix generic-board-config.sh for builds using O= When configuring the kernel using one of the generic MIPS defconfig targets, the generic-board-config.sh script is used to check requirements listed in board config fragments against a reference config in order to determine which board config fragments to merge into the final config. When specifying O= to configure in a directory other than the kernel source directory, this generic-board-config.sh script is invoked in the directory that we are configuring in (ie. the directory that O equals), and the path to the reference config is relative to the current directory. The script then changes the current directory to the source tree, which unfortunately breaks later access to the reference file since its path is relative to a directory that is no longer the current working directory. This results in configuration failing with errors such as: $ make ARCH=mips O=tmp 32r2_defconfig make[1]: Entering directory '/home/pburton/src/linux/tmp' Using ../arch/mips/configs/generic_defconfig as base Merging ../arch/mips/configs/generic/32r2.config Merging ../arch/mips/configs/generic/eb.config grep: ./.config.32r2_defconfig: No such file or directory grep: ./.config.32r2_defconfig: No such file or directory The base file '.config' does not exist. Exit. make[1]: *** [arch/mips/Makefile:505: 32r2_defconfig] Error 1 make[1]: Leaving directory '/home/pburton/src/linux-ingenic/tmp' make: *** [Makefile:145: sub-make] Error 2 Fix this by avoiding changing the working directory in generic-board-config.sh, instead using full paths to files under $(srctree)/ where necessary. Signed-off-by: Paul Burton Fixes: 27e0d4b05107 ("MIPS: generic: Allow filtering enabled boards by requirements") Cc: linux-mips@linux-mips.org Cc: kbuild test robot Cc: kbuild-all@01.org Patchwork: https://patchwork.linux-mips.org/patch/17231/ Signed-off-by: Ralf Baechle --- arch/mips/tools/generic-board-config.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/mips/tools/generic-board-config.sh b/arch/mips/tools/generic-board-config.sh index 5c4f93687039..654d652d7fa1 100755 --- a/arch/mips/tools/generic-board-config.sh +++ b/arch/mips/tools/generic-board-config.sh @@ -30,8 +30,6 @@ cfg="$4" boards_origin="$5" shift 5 -cd "${srctree}" - # Only print Skipping... lines if the user explicitly specified BOARDS=. In the # general case it only serves to obscure the useful output about what actually # was included. @@ -48,7 +46,7 @@ environment*) esac for board in $@; do - board_cfg="arch/mips/configs/generic/board-${board}.config" + board_cfg="${srctree}/arch/mips/configs/generic/board-${board}.config" if [ ! -f "${board_cfg}" ]; then echo "WARNING: Board config '${board_cfg}' not found" continue @@ -84,7 +82,7 @@ for board in $@; do done || continue # Merge this board config fragment into our final config file - ./scripts/kconfig/merge_config.sh \ + ${srctree}/scripts/kconfig/merge_config.sh \ -m -O ${objtree} ${cfg} ${board_cfg} \ | grep -Ev '^(#|Using)' done From ca8eb05b5f332a9e1ab3e2ece498d49f4d683470 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 8 Sep 2017 15:12:21 -0700 Subject: [PATCH 089/217] MIPS: math-emu: Remove pr_err() calls from fpu_emu() The FPU emulator includes 2 calls to pr_err() which are triggered by invalid instruction encodings for MIPSr6 cmp.cond.fmt instructions. These cases are not kernel errors, merely invalid instructions which are already handled by delivering a SIGILL which will provide notification that something failed in cases where that makes sense. In cases where that SIGILL is somewhat expected & being handled, for example when crashme happens to generate one of the affected bad encodings, the message is printed with no useful context about what triggered it & spams the kernel log for no good reason. Remove the pr_err() calls to make crashme run silently & treat the bad encodings the same way we do others, with a SIGILL & no further kernel log output. Signed-off-by: Paul Burton Fixes: f8c3c6717a71 ("MIPS: math-emu: Add support for the CMP.condn.fmt R6 instruction") Cc: linux-mips@linux-mips.org Cc: stable # v4.3+ Patchwork: https://patchwork.linux-mips.org/patch/17253/ Signed-off-by: Ralf Baechle --- arch/mips/math-emu/cp1emu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 192542dbd972..16d9ef5a78c5 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -2558,7 +2558,6 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, break; default: /* Reserved R6 ops */ - pr_err("Reserved MIPS R6 CMP.condn.S operation\n"); return SIGILL; } } @@ -2719,7 +2718,6 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, break; default: /* Reserved R6 ops */ - pr_err("Reserved MIPS R6 CMP.condn.D operation\n"); return SIGILL; } } From e0f06bba9629987fb3ec1d6928bf17ef689702e8 Mon Sep 17 00:00:00 2001 From: Mark D Rustad Date: Wed, 31 Aug 2016 10:34:28 -0700 Subject: [PATCH 090/217] ixgbe: Return error when getting PHY address if PHY access is not supported In cases where PHY register access is not supported, don't mislead a caller into thinking that it is supported by returning a PHY address. Instead, return -EOPNOTSUPP when PHY access is not supported. Signed-off-by: Mark Rustad Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d962368d08d0..822cdb4f2c25 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8529,6 +8529,10 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd) return ixgbe_ptp_set_ts_config(adapter, req); case SIOCGHWTSTAMP: return ixgbe_ptp_get_ts_config(adapter, req); + case SIOCGMIIPHY: + if (!adapter->hw.phy.ops.read_reg) + return -EOPNOTSUPP; + /* fall through */ default: return mdio_mii_ioctl(&adapter->hw.phy.mdio, if_mii(req), cmd); } From a39221ce969b316d3c3dcf7fcff8c0d8cf223007 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 3 Jul 2017 13:02:55 +0200 Subject: [PATCH 091/217] ixgbe: fix masking of bits read from IXGBE_VXLANCTRL register In ixgbe_clear_udp_tunnel_port(), we read the IXGBE_VXLANCTRL register and then try to mask some bits out of the value, using the logical instead of bitwise and operator. Fixes: a21d0822ff69 ("ixgbe: add support for geneve Rx offload") Signed-off-by: Sabrina Dubroca Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 822cdb4f2c25..4d76afd13868 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -4881,7 +4881,7 @@ static void ixgbe_clear_udp_tunnel_port(struct ixgbe_adapter *adapter, u32 mask) IXGBE_FLAG_GENEVE_OFFLOAD_CAPABLE))) return; - vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) && ~mask; + vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) & ~mask; IXGBE_WRITE_REG(hw, IXGBE_VXLANCTRL, vxlanctrl); if (mask & IXGBE_VXLANCTRL_VXLAN_UDPPORT_MASK) From f4986d250ada29ae0c65c209a9d8f97968ea7eae Mon Sep 17 00:00:00 2001 From: Ding Tianhong Date: Fri, 18 Aug 2017 14:21:04 +0800 Subject: [PATCH 092/217] Revert commit 1a8b6d76dc5b ("net:add one common config...") The new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING has been added to indicate that Relaxed Ordering Attributes (RO) should not be used for Transaction Layer Packets (TLP) targeted toward these affected Root Port, it will clear the bit4 in the PCIe Device Control register, so the PCIe device drivers could query PCIe configuration space to determine if it can send TLPs to Root Port with the Relaxed Ordering Attributes set. With this new flag we don't need the config ARCH_WANT_RELAX_ORDER to control the Relaxed Ordering Attributes for the ixgbe drivers just like the commit 1a8b6d76dc5b ("net:add one common config...") did, so revert this commit. Signed-off-by: Ding Tianhong Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- arch/Kconfig | 3 --- arch/sparc/Kconfig | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 1aafb4efbb51..d789a89cb32c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -937,9 +937,6 @@ config STRICT_MODULE_RWX and non-text memory will be made non-executable. This provides protection against certain security exploits (e.g. writing to text) -config ARCH_WANT_RELAX_ORDER - bool - config ARCH_HAS_REFCOUNT bool help diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 0be3828752e5..4e83f950713e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -44,7 +44,6 @@ config SPARC select ARCH_HAS_SG_CHAIN select CPU_NO_EFFICIENT_FFS select LOCKDEP_SMALL if LOCKDEP - select ARCH_WANT_RELAX_ORDER config SPARC32 def_bool !64BIT diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 2c19070d2a0b..e8c1788aed1f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -366,7 +366,7 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw) } IXGBE_WRITE_FLUSH(hw); -#ifndef CONFIG_ARCH_WANT_RELAX_ORDER +#ifndef CONFIG_SPARC /* Disable relaxed ordering */ for (i = 0; i < hw->mac.max_tx_queues; i++) { u32 regval; From 5e0fac63a694918870af9d6eaf716af19e7f5652 Mon Sep 17 00:00:00 2001 From: Ding Tianhong Date: Fri, 18 Aug 2017 14:21:05 +0800 Subject: [PATCH 093/217] net: ixgbe: Use new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag The ixgbe driver use the compile check to determine if it can send TLPs to Root Port with the Relaxed Ordering Attribute set, this is too inconvenient, now the new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING has been added to the kernel and we could check the bit4 in the PCIe Device Control register to determine whether we should use the Relaxed Ordering Attributes or not, so use this new way in the ixgbe driver. Signed-off-by: Ding Tianhong Acked-by: Emil Tantilov Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- .../net/ethernet/intel/ixgbe/ixgbe_82598.c | 22 ------------------- .../net/ethernet/intel/ixgbe/ixgbe_common.c | 19 ---------------- 2 files changed, 41 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c index 523f9d05a810..8a32eb7d47b9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c @@ -175,31 +175,9 @@ static s32 ixgbe_init_phy_ops_82598(struct ixgbe_hw *hw) **/ static s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw) { -#ifndef CONFIG_SPARC - u32 regval; - u32 i; -#endif s32 ret_val; ret_val = ixgbe_start_hw_generic(hw); - -#ifndef CONFIG_SPARC - /* Disable relaxed ordering */ - for (i = 0; ((i < hw->mac.max_tx_queues) && - (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) { - regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i)); - regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; - IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval); - } - - for (i = 0; ((i < hw->mac.max_rx_queues) && - (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) { - regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i)); - regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN | - IXGBE_DCA_RXCTRL_HEAD_WRO_EN); - IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval); - } -#endif if (ret_val) return ret_val; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index e8c1788aed1f..6e6ab6f6875e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -366,25 +366,6 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw) } IXGBE_WRITE_FLUSH(hw); -#ifndef CONFIG_SPARC - /* Disable relaxed ordering */ - for (i = 0; i < hw->mac.max_tx_queues; i++) { - u32 regval; - - regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i)); - regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; - IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval); - } - - for (i = 0; i < hw->mac.max_rx_queues; i++) { - u32 regval; - - regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i)); - regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN | - IXGBE_DCA_RXCTRL_HEAD_WRO_EN); - IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval); - } -#endif return 0; } From 8e679021c5b9465ac5b0d7efd26baab9b10a2dbd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Sep 2017 10:32:48 -0700 Subject: [PATCH 094/217] ixgbe: incorrect XDP ring accounting in ethtool tx_frame param Changing the TX ring parameters with an XDP program attached may cause the XDP queues to be cleared and the TX rings to be incorrectly configured. Fix by doing correct ring accounting in setup call. Fixes: 33fdc82f0883 ("ixgbe: add support for XDP_TX action") Signed-off-by: John Fastabend Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 72c565712a5f..c3e7a8191128 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -1048,7 +1048,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev, { struct ixgbe_adapter *adapter = netdev_priv(netdev); struct ixgbe_ring *temp_ring; - int i, err = 0; + int i, j, err = 0; u32 new_rx_count, new_tx_count; if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending)) @@ -1085,8 +1085,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev, } /* allocate temporary buffer to store rings in */ - i = max_t(int, adapter->num_tx_queues, adapter->num_rx_queues); - i = max_t(int, i, adapter->num_xdp_queues); + i = max_t(int, adapter->num_tx_queues + adapter->num_xdp_queues, + adapter->num_rx_queues); temp_ring = vmalloc(i * sizeof(struct ixgbe_ring)); if (!temp_ring) { @@ -1118,8 +1118,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev, } } - for (i = 0; i < adapter->num_xdp_queues; i++) { - memcpy(&temp_ring[i], adapter->xdp_ring[i], + for (j = 0; j < adapter->num_xdp_queues; j++, i++) { + memcpy(&temp_ring[i], adapter->xdp_ring[j], sizeof(struct ixgbe_ring)); temp_ring[i].count = new_tx_count; @@ -1139,10 +1139,10 @@ static int ixgbe_set_ringparam(struct net_device *netdev, memcpy(adapter->tx_ring[i], &temp_ring[i], sizeof(struct ixgbe_ring)); } - for (i = 0; i < adapter->num_xdp_queues; i++) { - ixgbe_free_tx_resources(adapter->xdp_ring[i]); + for (j = 0; j < adapter->num_xdp_queues; j++, i++) { + ixgbe_free_tx_resources(adapter->xdp_ring[j]); - memcpy(adapter->xdp_ring[i], &temp_ring[i], + memcpy(adapter->xdp_ring[j], &temp_ring[i], sizeof(struct ixgbe_ring)); } From f7974880cf869ddbd0ba9a8e2ab11dff4a667f96 Mon Sep 17 00:00:00 2001 From: John Einar Reitan Date: Mon, 9 Oct 2017 15:49:36 +0200 Subject: [PATCH 095/217] sync_file: Return consistent status in SYNC_IOC_FILE_INFO sync_file_ioctl_fence_info has a race between filling the status of the underlying fences and the overall status of the sync_file. If fence transitions in the time frame between its sync_fill_fence_info and the later dma_fence_is_signaled for the sync_file, the returned information is inconsistent showing non-signaled underlying fences but an overall signaled state. This patch changes sync_file_ioctl_fence_info to track what has been encoded and using that as the overall sync_file status. Tested-by: Vamsidhar Reddy Gaddam Signed-off-by: John Einar Reitan Cc: Sumit Semwal Cc: Gustavo Padovan Cc: dri-devel@lists.freedesktop.org Reviewed-by: Chris Wilson Signed-off-by: Gustavo Padovan Link: https://patchwork.freedesktop.org/patch/msgid/20171009134936.27219-1-john.reitan@arm.com --- drivers/dma-buf/sync_file.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c index 66fb40d0ebdb..03830634e141 100644 --- a/drivers/dma-buf/sync_file.c +++ b/drivers/dma-buf/sync_file.c @@ -383,7 +383,7 @@ static long sync_file_ioctl_merge(struct sync_file *sync_file, return err; } -static void sync_fill_fence_info(struct dma_fence *fence, +static int sync_fill_fence_info(struct dma_fence *fence, struct sync_fence_info *info) { strlcpy(info->obj_name, fence->ops->get_timeline_name(fence), @@ -399,6 +399,8 @@ static void sync_fill_fence_info(struct dma_fence *fence, test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags) ? ktime_to_ns(fence->timestamp) : ktime_set(0, 0); + + return info->status; } static long sync_file_ioctl_fence_info(struct sync_file *sync_file, @@ -424,8 +426,12 @@ static long sync_file_ioctl_fence_info(struct sync_file *sync_file, * sync_fence_info and return the actual number of fences on * info->num_fences. */ - if (!info.num_fences) + if (!info.num_fences) { + info.status = dma_fence_is_signaled(sync_file->fence); goto no_fences; + } else { + info.status = 1; + } if (info.num_fences < num_fences) return -EINVAL; @@ -435,8 +441,10 @@ static long sync_file_ioctl_fence_info(struct sync_file *sync_file, if (!fence_info) return -ENOMEM; - for (i = 0; i < num_fences; i++) - sync_fill_fence_info(fences[i], &fence_info[i]); + for (i = 0; i < num_fences; i++) { + int status = sync_fill_fence_info(fences[i], &fence_info[i]); + info.status = info.status <= 0 ? info.status : status; + } if (copy_to_user(u64_to_user_ptr(info.sync_fence_info), fence_info, size)) { @@ -446,7 +454,6 @@ static long sync_file_ioctl_fence_info(struct sync_file *sync_file, no_fences: sync_file_get_name(sync_file, info.name, sizeof(info.name)); - info.status = dma_fence_is_signaled(sync_file->fence); info.num_fences = num_fences; if (copy_to_user((void __user *)arg, &info, sizeof(info))) From 62cf27e52b8c9a39066172ca6b6134cb5eaa9450 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 Oct 2017 08:39:43 +0200 Subject: [PATCH 096/217] ipv6: Fix traffic triggered IPsec connections. A recent patch removed the dst_free() on the allocated dst_entry in ipv6_blackhole_route(). The dst_free() marked the dst_entry as dead and added it to the gc list. I.e. it was setup for a one time usage. As a result we may now have a blackhole route cached at a socket on some IPsec scenarios. This makes the connection unusable. Fix this by marking the dst_entry directly at allocation time as 'dead', so it is used only once. Fixes: 587fea741134 ("ipv6: mark DST_NOGC and remove the operation of dst_free()") Reported-by: Tobias Brunner Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..a96d5b385d8f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1325,7 +1325,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, 0); + DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); From 6c0e7284d89995877740d8a26c3e99a937312a3c Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 Oct 2017 08:43:55 +0200 Subject: [PATCH 097/217] ipv4: Fix traffic triggered IPsec connections. A recent patch removed the dst_free() on the allocated dst_entry in ipv4_blackhole_route(). The dst_free() marked the dst_entry as dead and added it to the gc list. I.e. it was setup for a one time usage. As a result we may now have a blackhole route cached at a socket on some IPsec scenarios. This makes the connection unusable. Fix this by marking the dst_entry directly at allocation time as 'dead', so it is used only once. Fixes: b838d5e1c5b6 ("ipv4: mark DST_NOGC and remove the operation of dst_free()") Reported-by: Tobias Brunner Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac6fde5d45f1..3d9f1c2f81c5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2513,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; From 41c87425a1ac9b633e0fcc78eb1f19640c8fb5a0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 9 Oct 2017 14:14:51 +0200 Subject: [PATCH 098/217] netlink: do not set cb_running if dump's start() errs It turns out that multiple places can call netlink_dump(), which means it's still possible to dereference partially initialized values in dump() that were the result of a faulty returned start(). This fixes the issue by calling start() _before_ setting cb_running to true, so that there's no chance at all of hitting the dump() function through any indirect paths. It also moves the call to start() to be when the mutex is held. This has the nice side effect of serializing invocations to start(), which is likely desirable anyway. It also prevents any possible other races that might come out of this logic. In testing this with several different pieces of tricky code to trigger these issues, this commit fixes all avenues that I'm aware of. Signed-off-by: Jason A. Donenfeld Cc: Johannes Berg Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 94c11cf0459d..f34750691c5c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; + if (cb->start) { + ret = cb->start(cb); + if (ret) + goto error_unlock; + } + nlk->cb_running = true; mutex_unlock(nlk->cb_mutex); - ret = 0; - if (cb->start) - ret = cb->start(cb); - - if (!ret) - ret = netlink_dump(sk); + ret = netlink_dump(sk); sock_put(sk); From 996b44fcef8f216ea0b6b6e74468c5a77b5e341f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 9 Oct 2017 14:52:10 +0200 Subject: [PATCH 099/217] udp: fix bcast packet reception The commit bc044e8db796 ("udp: perform source validation for mcast early demux") does not take into account that broadcast packets lands in the same code path and they need different checks for the source address - notably, zero source address are valid for bcast and invalid for mcast. As a result, 2nd and later broadcast packets with 0 source address landing to the same socket are dropped. This breaks dhcp servers. Since we don't have stringent performance requirements for ingress broadcast traffic, fix it by disabling UDP early demux such traffic. Reported-by: Hannes Frederic Sowa Fixes: bc044e8db796 ("udp: perform source validation for mcast early demux") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5676237d2b0f..e45177ceb0ee 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2240,20 +2240,16 @@ int udp_v4_early_demux(struct sk_buff *skb) iph = ip_hdr(skb); uh = udp_hdr(skb); - if (skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) { + if (skb->pkt_type == PACKET_MULTICAST) { in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; - /* we are supposed to accept bcast packets */ - if (skb->pkt_type == PACKET_MULTICAST) { - ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, - iph->protocol); - if (!ours) - return 0; - } + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) + return 0; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, From d7ba25bd9ef802ff02414e9105f4222d1795f27a Mon Sep 17 00:00:00 2001 From: Manasi Navare Date: Wed, 4 Oct 2017 09:48:26 -0700 Subject: [PATCH 100/217] drm/i915/edp: Get the Panel Power Off timestamp after panel is off Kernel stores the time in jiffies at which the eDP panel is turned off. This should be obtained after the panel is off (after the wait_panel_off). When we next attempt to turn the panel on, we use the difference between the timestamp at which we want to turn the panel on and timestamp at which panel was turned off to ensure that this is equal to panel power cycle delay and if not we wait for the remaining time. Not waiting for the panel power cycle delay can cause the panel to not turn on giving rise to AUX timeouts for the attempted AUX transactions. v2: * Separate lines for bugzilla (Jani Nikula) * Suggested by tag (Daniel Vetter) Cc: Daniel Vetter Cc: Jani Nikula Cc: stable@vger.kernel.org Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101518 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101144 Suggested-by: Daniel Vetter Signed-off-by: Manasi Navare Reviewed-by: Daniel Vetter Reviewed-by: Jani Nikula Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1507135706-17147-1-git-send-email-manasi.d.navare@intel.com (cherry picked from commit cbacf02e7796fea02e5c6e46c90ed7cbe9e6f2c0) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 64134947c0aa..c0f8d7e66049 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -2307,8 +2307,8 @@ static void edp_panel_off(struct intel_dp *intel_dp) I915_WRITE(pp_ctrl_reg, pp); POSTING_READ(pp_ctrl_reg); - intel_dp->panel_power_off_time = ktime_get_boottime(); wait_panel_off(intel_dp); + intel_dp->panel_power_off_time = ktime_get_boottime(); /* We got a reference when we enabled the VDD. */ intel_display_power_put(dev_priv, intel_dp->aux_power_domain); From 7313f5a93d2017f789909a7a727a6cab48ea6d20 Mon Sep 17 00:00:00 2001 From: Manasi Navare Date: Tue, 3 Oct 2017 16:37:25 -0700 Subject: [PATCH 101/217] drm/i915/edp: Increase the T12 delay quirk to 1300ms For this specific PCI device, the eDP panel requires a higher panel power cycle delay of 1300ms where the minimum spec requirement of panel power cycle delay is 500ms. This fix in combination with correct timestamp at which we get the panel power off time fixes the dP AUX CH timeouts seen on various IGT tests. Fixes: c99a259b4b5192ba ("drm/i915/edp: Add a T12 panel delay quirk to fix DP AUX CH timeouts") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101144 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101518 Cc: Daniel Vetter Cc: Jani Nikula Cc: Ville Syrjala Signed-off-by: Manasi Navare Acked-by: Daniel Vetter Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1507073845-13420-2-git-send-email-manasi.d.navare@intel.com (cherry picked from commit c02b8fb4073d1b9aa5af909a91b51056b819d946) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index c0f8d7e66049..203198659ab2 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -5273,7 +5273,7 @@ intel_dp_init_panel_power_sequencer(struct drm_device *dev, * seems sufficient to avoid this problem. */ if (dev_priv->quirks & QUIRK_INCREASE_T12_DELAY) { - vbt.t11_t12 = max_t(u16, vbt.t11_t12, 900 * 10); + vbt.t11_t12 = max_t(u16, vbt.t11_t12, 1300 * 10); DRM_DEBUG_KMS("Increasing T12 panel delay as per the quirk to %d\n", vbt.t11_t12); } From d6a55c63e6adcb58957bbdce2d390088970273da Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 5 Oct 2017 16:15:20 +0200 Subject: [PATCH 102/217] drm/i915: Use crtc_state_is_legacy_gamma in intel_color_check crtc_state_is_legacy_gamma also checks for CTM, which was missing from intel_color_check. By using the same condition for commit and check we reduce the chance of mismatches. This was spotted by KASAN while trying to rework kms_color igt test. [ 72.008660] ================================================================== [ 72.009326] BUG: KASAN: slab-out-of-bounds in bdw_load_gamma_lut.isra.3+0x15c/0x360 [i915] [ 72.009519] Read of size 2 at addr ffff880220216e50 by task kms_color/1158 [ 72.009900] CPU: 2 PID: 1158 Comm: kms_color Tainted: G U W 4.14.0-rc3-patser+ #5281 [ 72.009921] Hardware name: GIGABYTE GB-BKi3A-7100/MFLP3AP-00, BIOS F1 07/27/2016 [ 72.009941] Call Trace: [ 72.009968] dump_stack+0xc5/0x151 [ 72.009996] ? _atomic_dec_and_lock+0x10f/0x10f [ 72.010024] ? show_regs_print_info+0x3c/0x3c [ 72.010072] print_address_description+0x7f/0x240 [ 72.010108] kasan_report+0x216/0x370 [ 72.010308] ? bdw_load_gamma_lut.isra.3+0x15c/0x360 [i915] [ 72.010349] __asan_load2+0x74/0x80 [ 72.010552] bdw_load_gamma_lut.isra.3+0x15c/0x360 [i915] [ 72.010772] broadwell_load_luts+0x1f0/0x300 [i915] [ 72.010997] intel_color_load_luts+0x36/0x40 [i915] [ 72.011205] intel_begin_crtc_commit+0xa1/0x310 [i915] [ 72.011283] drm_atomic_helper_commit_planes_on_crtc+0xa6/0x320 [drm_kms_helper] [ 72.011316] ? wait_for_completion_io+0x460/0x460 [ 72.011524] intel_update_crtc+0xe3/0x100 [i915] [ 72.011720] skl_update_crtcs+0x360/0x3f0 [i915] [ 72.011945] ? intel_update_crtcs+0xf0/0xf0 [i915] [ 72.012010] ? drm_atomic_helper_wait_for_dependencies+0x3d9/0x400 [drm_kms_helper] [ 72.012231] intel_atomic_commit_tail+0x8db/0x1500 [i915] [ 72.012273] ? __lock_is_held+0x9c/0xc0 [ 72.012494] ? skl_update_crtcs+0x3f0/0x3f0 [i915] [ 72.012518] ? find_next_bit+0xb/0x10 [ 72.012544] ? cpumask_next+0x1a/0x20 [ 72.012745] ? i915_sw_fence_complete+0x9d/0xe0 [i915] [ 72.012938] ? __i915_sw_fence_complete+0x5d0/0x5d0 [i915] [ 72.013176] intel_atomic_commit+0x528/0x570 [i915] [ 72.013280] ? drm_atomic_get_property+0xc00/0xc00 [drm] [ 72.013466] ? intel_atomic_commit_tail+0x1500/0x1500 [i915] [ 72.013496] ? kmem_cache_alloc_trace+0x266/0x280 [ 72.013714] ? intel_atomic_commit_tail+0x1500/0x1500 [i915] [ 72.013812] drm_atomic_commit+0x77/0x80 [drm] [ 72.013911] set_property_atomic+0x14a/0x210 [drm] [ 72.014015] ? drm_object_property_get_value+0x70/0x70 [drm] [ 72.014080] ? mutex_unlock+0xd/0x10 [ 72.014292] ? intel_atomic_commit_tail+0x1500/0x1500 [i915] [ 72.014379] drm_mode_obj_set_property_ioctl+0x1cf/0x310 [drm] [ 72.014481] ? drm_mode_obj_find_prop_id+0xa0/0xa0 [drm] [ 72.014510] ? lock_release+0x6c0/0x6c0 [ 72.014602] ? drm_is_current_master+0x46/0x60 [drm] [ 72.014706] drm_ioctl_kernel+0x148/0x1d0 [drm] [ 72.014799] ? drm_mode_obj_find_prop_id+0xa0/0xa0 [drm] [ 72.014898] ? drm_ioctl_permit+0x100/0x100 [drm] [ 72.014936] ? kasan_check_write+0x14/0x20 [ 72.015039] drm_ioctl+0x441/0x660 [drm] [ 72.015129] ? drm_mode_obj_find_prop_id+0xa0/0xa0 [drm] [ 72.015235] ? drm_getstats+0x20/0x20 [drm] [ 72.015287] ? ___might_sleep+0x159/0x340 [ 72.015311] ? find_held_lock+0xcf/0xf0 [ 72.015341] ? __schedule_bug+0x110/0x110 [ 72.015405] do_vfs_ioctl+0xa88/0xb10 [ 72.015449] ? ioctl_preallocate+0x1a0/0x1a0 [ 72.015487] ? selinux_capable+0x20/0x20 [ 72.015525] ? rcu_dynticks_momentary_idle+0x40/0x40 [ 72.015607] SyS_ioctl+0x4e/0x80 [ 72.015647] entry_SYSCALL_64_fastpath+0x18/0xad [ 72.015670] RIP: 0033:0x7ff74a3d04d7 [ 72.015691] RSP: 002b:00007ffc594bec08 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 72.015734] RAX: ffffffffffffffda RBX: ffffffff8718f54a RCX: 00007ff74a3d04d7 [ 72.015756] RDX: 00007ffc594bec40 RSI: 00000000c01864ba RDI: 0000000000000003 [ 72.015777] RBP: ffff880211c0ff98 R08: 0000000000000086 R09: 0000000000000000 [ 72.015799] R10: 00007ff74a691b58 R11: 0000000000000246 R12: 0000000000000355 [ 72.015821] R13: 00000000ff00eb00 R14: 0000000000000a00 R15: 00007ff746082000 [ 72.015857] ? trace_hardirqs_off_caller+0xfa/0x110 Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20171005141520.23990-1-maarten.lankhorst@linux.intel.com [mlankhorst: s/crtc_state_is_legacy/&_gamma/ (danvet)] Reviewed-by: Daniel Vetter Fixes: 82cf435b3134 ("drm/i915: Implement color management on bdw/skl/bxt/kbl") Cc: # v4.7+ (cherry picked from commit 0c3767b28186c8129f2a2cfec06a93dcd6102391) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_color.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c index ff9ecd211abb..b8315bca852b 100644 --- a/drivers/gpu/drm/i915/intel_color.c +++ b/drivers/gpu/drm/i915/intel_color.c @@ -74,7 +74,7 @@ #define I9XX_CSC_COEFF_1_0 \ ((7 << 12) | I9XX_CSC_COEFF_FP(CTM_COEFF_1_0, 8)) -static bool crtc_state_is_legacy(struct drm_crtc_state *state) +static bool crtc_state_is_legacy_gamma(struct drm_crtc_state *state) { return !state->degamma_lut && !state->ctm && @@ -288,7 +288,7 @@ static void cherryview_load_csc_matrix(struct drm_crtc_state *state) } mode = (state->ctm ? CGM_PIPE_MODE_CSC : 0); - if (!crtc_state_is_legacy(state)) { + if (!crtc_state_is_legacy_gamma(state)) { mode |= (state->degamma_lut ? CGM_PIPE_MODE_DEGAMMA : 0) | (state->gamma_lut ? CGM_PIPE_MODE_GAMMA : 0); } @@ -469,7 +469,7 @@ static void broadwell_load_luts(struct drm_crtc_state *state) struct intel_crtc_state *intel_state = to_intel_crtc_state(state); enum pipe pipe = to_intel_crtc(state->crtc)->pipe; - if (crtc_state_is_legacy(state)) { + if (crtc_state_is_legacy_gamma(state)) { haswell_load_luts(state); return; } @@ -529,7 +529,7 @@ static void glk_load_luts(struct drm_crtc_state *state) glk_load_degamma_lut(state); - if (crtc_state_is_legacy(state)) { + if (crtc_state_is_legacy_gamma(state)) { haswell_load_luts(state); return; } @@ -551,7 +551,7 @@ static void cherryview_load_luts(struct drm_crtc_state *state) uint32_t i, lut_size; uint32_t word0, word1; - if (crtc_state_is_legacy(state)) { + if (crtc_state_is_legacy_gamma(state)) { /* Turn off degamma/gamma on CGM block. */ I915_WRITE(CGM_PIPE_MODE(pipe), (state->ctm ? CGM_PIPE_MODE_CSC : 0)); @@ -632,12 +632,10 @@ int intel_color_check(struct drm_crtc *crtc, return 0; /* - * We also allow no degamma lut and a gamma lut at the legacy + * We also allow no degamma lut/ctm and a gamma lut at the legacy * size (256 entries). */ - if (!crtc_state->degamma_lut && - crtc_state->gamma_lut && - crtc_state->gamma_lut->length == LEGACY_LUT_LENGTH) + if (crtc_state_is_legacy_gamma(crtc_state)) return 0; return -EINVAL; From de3ded0ae677749b4fc9f59d15b26b9f077340ac Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 2 Oct 2017 11:04:16 +0100 Subject: [PATCH 103/217] drm/i915: Silence compiler warning for hsw_power_well_enable() Not all compilers are able to determine that pg is guarded by wait_fuses and so may think that pg is used uninitialized. Reported-by: Geert Uytterhoeven Fixes: b2891eb2531e ("drm/i915/hsw+: Add has_fuses power well attribute") Signed-off-by: Chris Wilson Cc: Imre Deak Cc: Arkadiusz Hiler Link: https://patchwork.freedesktop.org/patch/msgid/20171002100416.25865-1-chris@chris-wilson.co.uk Reviewed-by: Imre Deak (cherry picked from commit 320671f94ada80ff036cc9d5dcd730ba4f3e0f1a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_runtime_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index b3a087cb0860..49577eba8e7e 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -368,7 +368,7 @@ static void hsw_power_well_enable(struct drm_i915_private *dev_priv, { enum i915_power_well_id id = power_well->id; bool wait_fuses = power_well->hsw.has_fuses; - enum skl_power_gate pg; + enum skl_power_gate uninitialized_var(pg); u32 val; if (wait_fuses) { From b85577b72837ee8d9213e93d2c8b67ef78a47803 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 6 Oct 2017 12:56:17 +0100 Subject: [PATCH 104/217] drm/i915: Order two completing nop_submit_request If two nop's (requests in-flight following a wedged device) complete at the same time, the global_seqno value written to the HWSP is undefined as the two threads are not serialized. v2: Use irqsafe spinlock. We expect the callback may be called from inside another irq spinlock, so we can't unconditionally restore irqs. Fixes: ce1135c7de64 ("drm/i915: Complete requests in nop_submit_request") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20171006115617.18432-1-chris@chris-wilson.co.uk (cherry picked from commit 8d550824c6f52506754f11cb6be51aa153cc580d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 19404c96eeb1..af289d35b77a 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3013,10 +3013,15 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv) static void nop_submit_request(struct drm_i915_gem_request *request) { + unsigned long flags; + GEM_BUG_ON(!i915_terminally_wedged(&request->i915->gpu_error)); dma_fence_set_error(&request->fence, -EIO); - i915_gem_request_submit(request); + + spin_lock_irqsave(&request->engine->timeline->lock, flags); + __i915_gem_request_submit(request); intel_engine_init_global_seqno(request->engine, request->global_seqno); + spin_unlock_irqrestore(&request->engine->timeline->lock, flags); } static void engine_set_wedged(struct intel_engine_cs *engine) From 7b50f7b24cd6c98541f1af53bddc5b6e861ee8c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 1 Apr 2016 18:37:25 +0300 Subject: [PATCH 105/217] drm/i915: Read timings from the correct transcoder in intel_crtc_mode_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit intel_crtc->config->cpu_transcoder isn't yet filled out when intel_crtc_mode_get() gets called during output probing, so we should not use it there. Instead intel_crtc_mode_get() figures out the correct transcoder on its own, and that's what we should use. If the BIOS boots LVDS on pipe B, intel_crtc_mode_get() would actually end up reading the timings from pipe A instead (since PIPE_A==0), which clearly isn't what we want. It looks to me like this may have been broken by commit eccb140bca67 ("drm/i915: hw state readout&check support for cpu_transcoder") as that one removed the early initialization of cpu_transcoder from intel_crtc_init(). Cc: stable@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Rob Kramer Cc: Daniel Vetter Reported-by: Rob Kramer Fixes: eccb140bca67 ("drm/i915: hw state readout&check support for cpu_transcoder") References: https://lists.freedesktop.org/archives/dri-devel/2016-April/104142.html Signed-off-by: Ville Syrjälä Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/1459525046-19425-1-git-send-email-ville.syrjala@linux.intel.com (cherry picked from commit e30a154b5262b967b133b06ac40777e651045898) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_display.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 64f7b51ed97c..5c7828c52d12 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -10245,13 +10245,10 @@ struct drm_display_mode *intel_crtc_mode_get(struct drm_device *dev, { struct drm_i915_private *dev_priv = to_i915(dev); struct intel_crtc *intel_crtc = to_intel_crtc(crtc); - enum transcoder cpu_transcoder = intel_crtc->config->cpu_transcoder; + enum transcoder cpu_transcoder; struct drm_display_mode *mode; struct intel_crtc_state *pipe_config; - int htot = I915_READ(HTOTAL(cpu_transcoder)); - int hsync = I915_READ(HSYNC(cpu_transcoder)); - int vtot = I915_READ(VTOTAL(cpu_transcoder)); - int vsync = I915_READ(VSYNC(cpu_transcoder)); + u32 htot, hsync, vtot, vsync; enum pipe pipe = intel_crtc->pipe; mode = kzalloc(sizeof(*mode), GFP_KERNEL); @@ -10279,6 +10276,13 @@ struct drm_display_mode *intel_crtc_mode_get(struct drm_device *dev, i9xx_crtc_clock_get(intel_crtc, pipe_config); mode->clock = pipe_config->port_clock / pipe_config->pixel_multiplier; + + cpu_transcoder = pipe_config->cpu_transcoder; + htot = I915_READ(HTOTAL(cpu_transcoder)); + hsync = I915_READ(HSYNC(cpu_transcoder)); + vtot = I915_READ(VTOTAL(cpu_transcoder)); + vsync = I915_READ(VSYNC(cpu_transcoder)); + mode->hdisplay = (htot & 0xffff) + 1; mode->htotal = ((htot & 0xffff0000) >> 16) + 1; mode->hsync_start = (hsync & 0xffff) + 1; From 1a2ace56cea2d752b212e198c5e70ff0c0f39b59 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 9 Oct 2017 11:44:53 -0500 Subject: [PATCH 106/217] net: thunderx: mark expected switch fall-throughs in nicvf_main() In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Cc: Sunil Goutham Cc: Robert Richter Cc: linux-arm-kernel@lists.infradead.org Cc: netdev@vger.kernel.org Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 49b80da51ba7..805ab45e9b5a 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -565,8 +565,10 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, return true; default: bpf_warn_invalid_xdp_action(action); + /* fall through */ case XDP_ABORTED: trace_xdp_exception(nic->netdev, prog, action); + /* fall through */ case XDP_DROP: /* Check if it's a recycled page, if not * unmap the DMA mapping. From 66ec11919a0f96e936bb731fdbc2851316077d26 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 6 Oct 2017 19:38:22 +0100 Subject: [PATCH 107/217] perf pmu: Unbreak perf record for arm/arm64 with events with explicit PMU Currently, perf record is broken on arm/arm64 systems when the PMU is specified explicitly as part of the event, e.g. $ ./perf record -e armv8_cortex_a53/cpu_cycles/u true In such cases, perf record fails to open events unless perf_event_paranoid is set to -1, even if the PMU in question supports mode exclusion. Further, even when perf_event_paranoid is toggled, no samples are recorded. This is an unintended side effect of commit: e3ba76deef23064f ("perf tools: Force uncore events to system wide monitoring) ... which assumes that if a PMU has an associated cpu_map, it is an uncore PMU, and forces events for such PMUs to be system-wide. This is not true for arm/arm64 systems, which can have heterogeneous CPUs. To account for this, multiple CPU PMUs are exposed, each with a "cpus" field under sysfs, which the perf tool parses into a cpu_map. ARM PMUs do not have a "cpumask" file, and only have a "cpus" file. For the gory details as to why, see commit: 7e3fcffe95544010 ("perf pmu: Support alternative sysfs cpumask") Given all of this, we can instead identify uncore PMUs by explicitly checking for a "cpumask" file, and restore arm/arm64 PMU support back to a working state. This patch does so, adding a new perf_pmu::is_uncore field, and splitting the existing cpumask parsing so that it can be reused. Signed-off-by: Mark Rutland Tested-by Will Deacon Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: 4.12+ Fixes: e3ba76deef23064f ("perf tools: Force uncore events to system wide monitoring) Link: http://lkml.kernel.org/r/1507315102-5942-1-git-send-email-mark.rutland@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 9 +++-- tools/perf/util/pmu.c | 70 +++++++++++++++++++++++----------- tools/perf/util/pmu.h | 1 + 3 files changed, 54 insertions(+), 26 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f6257fb4f08c..39b15968eab1 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -309,10 +309,11 @@ static char *get_config_name(struct list_head *head_terms) static struct perf_evsel * __add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, - char *name, struct cpu_map *cpus, + char *name, struct perf_pmu *pmu, struct list_head *config_terms, bool auto_merge_stats) { struct perf_evsel *evsel; + struct cpu_map *cpus = pmu ? pmu->cpus : NULL; event_attr_init(attr); @@ -323,7 +324,7 @@ __add_event(struct list_head *list, int *idx, (*idx)++; evsel->cpus = cpu_map__get(cpus); evsel->own_cpus = cpu_map__get(cpus); - evsel->system_wide = !!cpus; + evsel->system_wide = pmu ? pmu->is_uncore : false; evsel->auto_merge_stats = auto_merge_stats; if (name) @@ -1233,7 +1234,7 @@ static int __parse_events_add_pmu(struct parse_events_state *parse_state, if (!head_config) { attr.type = pmu->type; - evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL, auto_merge_stats); + evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu, NULL, auto_merge_stats); return evsel ? 0 : -ENOMEM; } @@ -1254,7 +1255,7 @@ static int __parse_events_add_pmu(struct parse_events_state *parse_state, return -EINVAL; evsel = __add_event(list, &parse_state->idx, &attr, - get_config_name(head_config), pmu->cpus, + get_config_name(head_config), pmu, &config_terms, auto_merge_stats); if (evsel) { evsel->unit = info.unit; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ac16a9db1fb5..1c4d7b4e4fb5 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -470,31 +470,10 @@ static void pmu_read_sysfs(void) closedir(dir); } -static struct cpu_map *pmu_cpumask(const char *name) +static struct cpu_map *__pmu_cpumask(const char *path) { - struct stat st; - char path[PATH_MAX]; FILE *file; struct cpu_map *cpus; - const char *sysfs = sysfs__mountpoint(); - const char *templates[] = { - "%s/bus/event_source/devices/%s/cpumask", - "%s/bus/event_source/devices/%s/cpus", - NULL - }; - const char **template; - - if (!sysfs) - return NULL; - - for (template = templates; *template; template++) { - snprintf(path, PATH_MAX, *template, sysfs, name); - if (stat(path, &st) == 0) - break; - } - - if (!*template) - return NULL; file = fopen(path, "r"); if (!file) @@ -505,6 +484,51 @@ static struct cpu_map *pmu_cpumask(const char *name) return cpus; } +/* + * Uncore PMUs have a "cpumask" file under sysfs. CPU PMUs (e.g. on arm/arm64) + * may have a "cpus" file. + */ +#define CPUS_TEMPLATE_UNCORE "%s/bus/event_source/devices/%s/cpumask" +#define CPUS_TEMPLATE_CPU "%s/bus/event_source/devices/%s/cpus" + +static struct cpu_map *pmu_cpumask(const char *name) +{ + char path[PATH_MAX]; + struct cpu_map *cpus; + const char *sysfs = sysfs__mountpoint(); + const char *templates[] = { + CPUS_TEMPLATE_UNCORE, + CPUS_TEMPLATE_CPU, + NULL + }; + const char **template; + + if (!sysfs) + return NULL; + + for (template = templates; *template; template++) { + snprintf(path, PATH_MAX, *template, sysfs, name); + cpus = __pmu_cpumask(path); + if (cpus) + return cpus; + } + + return NULL; +} + +static bool pmu_is_uncore(const char *name) +{ + char path[PATH_MAX]; + struct cpu_map *cpus; + const char *sysfs = sysfs__mountpoint(); + + snprintf(path, PATH_MAX, CPUS_TEMPLATE_UNCORE, sysfs, name); + cpus = __pmu_cpumask(path); + cpu_map__put(cpus); + + return !!cpus; +} + /* * Return the CPU id as a raw string. * @@ -617,6 +641,8 @@ static struct perf_pmu *pmu_lookup(const char *name) pmu->cpus = pmu_cpumask(name); + pmu->is_uncore = pmu_is_uncore(name); + INIT_LIST_HEAD(&pmu->format); INIT_LIST_HEAD(&pmu->aliases); list_splice(&format, &pmu->format); diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 389e9729331f..fe0de0502ce2 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -22,6 +22,7 @@ struct perf_pmu { char *name; __u32 type; bool selectable; + bool is_uncore; struct perf_event_attr *default_config; struct cpu_map *cpus; struct list_head format; /* HEAD struct perf_pmu_format -> list */ From aa7b4e02b328f0589b6133e72aafb1289f614a79 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Oct 2017 15:55:45 -0300 Subject: [PATCH 108/217] tools include uapi bpf.h: Sync kernel ABI header with tooling header Silences the checker: Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h' The 90caccdd8cc0 ("bpf: fix bpf_tail_call() x64 JIT") cset only updated a comment in uapi/bpf.h. Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: David Ahern Cc: David S. Miller Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-rwx2cqbf0x1lwa1krsr6e6hd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 43ab5c402f98..f90860d1f897 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -312,7 +312,7 @@ union bpf_attr { * jump into another BPF program * @ctx: context pointer passed to next program * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run + * @index: 32-bit index inside array that selects specific program to run * Return: 0 on success or negative error * * int bpf_clone_redirect(skb, ifindex, flags) From fdfbad3256918fc5736d68384331d2dbf45ccbd6 Mon Sep 17 00:00:00 2001 From: Aleksander Morgado Date: Mon, 9 Oct 2017 14:05:12 +0200 Subject: [PATCH 109/217] cdc_ether: flag the u-blox TOBY-L2 and SARA-U2 as wwan The u-blox TOBY-L2 is a LTE Cat 4 module with HSPA+ and 2G fallback. This module allows switching to different USB profiles with the 'AT+UUSBCONF' command, and provides a ECM network interface when the 'AT+UUSBCONF=2' profile is selected. The u-blox SARA-U2 is a HSPA module with 2G fallback. The default USB configuration includes a ECM network interface. Both these modules are controlled via AT commands through one of the TTYs exposed. Connecting these modules may be done just by activating the desired PDP context with 'AT+CGACT=1,' and then running DHCP on the ECM interface. Signed-off-by: Aleksander Morgado Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 29c7e2ec0dcb..52ea80bcd639 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -560,6 +560,7 @@ static const struct driver_info wwan_info = { #define NVIDIA_VENDOR_ID 0x0955 #define HP_VENDOR_ID 0x03f0 #define MICROSOFT_VENDOR_ID 0x045e +#define UBLOX_VENDOR_ID 0x1546 static const struct usb_device_id products[] = { /* BLACKLIST !! @@ -868,6 +869,18 @@ static const struct usb_device_id products[] = { USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&zte_cdc_info, +}, { + /* U-blox TOBY-L2 */ + USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1143, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, +}, { + /* U-blox SARA-U2 */ + USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1104, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), From 96ca579a1ecc943b75beba58bebb0356f6cc4b51 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 11:36:52 -0700 Subject: [PATCH 110/217] waitid(): Add missing access_ok() checks Adds missing access_ok() checks. CVE-2017-5123 Reported-by: Chris Salls Signed-off-by: Kees Cook Acked-by: Al Viro Fixes: 4c48abe91be0 ("waitid(): switch copyout of siginfo to unsafe_put_user()") Cc: stable@kernel.org # 4.13 Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/exit.c b/kernel/exit.c index f2cd53e92147..cf28528842bc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; + if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + goto Efault; + user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); @@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; + if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + goto Efault; + user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); From 5ab894aee0f171a682bcd90dd5d1930cb53c55dc Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 9 Oct 2017 16:28:37 +0300 Subject: [PATCH 111/217] device property: Track owner device of device property Deletion of subdevice will remove device properties associated to parent when they share the same firmware node after commit 478573c93abd (driver core: Don't leak secondary fwnode on device removal). This was observed with a driver adding subdevice that driver wasn't able to read device properties after rmmod/modprobe cycle. Consider the lifecycle of it: parent device registration ACPI_COMPANION_SET() device_add_properties() pset_copy_set() set_secondary_fwnode(dev, &p->fwnode) device_add() parent probe read device properties ACPI_COMPANION_SET(subdevice, ACPI_COMPANION(parent)) device_add(subdevice) parent remove device_del(subdevice) device_remove_properties() set_secondary_fwnode(dev, NULL); pset_free() Parent device will have its primary firmware node pointing to an ACPI node and secondary firmware node point to device properties. ACPI_COMPANION_SET() call in parent probe will set the subdevice's firmware node to point to the same 'struct fwnode_handle' and the associated secondary firmware node, i.e. the device properties as the parent. When subdevice is deleted in parent remove that will remove those device properties and attempt to read device properties in next parent probe call will fail. Fix this by tracking the owner device of device properties and delete them only when owner device is being deleted. Fixes: 478573c93abd (driver core: Don't leak secondary fwnode on device removal) Cc: 4.9+ # 4.9+ Signed-off-by: Jarkko Nikula Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index d0b65bbe7e15..21fcc13013a5 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -21,6 +21,7 @@ #include struct property_set { + struct device *dev; struct fwnode_handle fwnode; const struct property_entry *properties; }; @@ -891,6 +892,7 @@ static struct property_set *pset_copy_set(const struct property_set *pset) void device_remove_properties(struct device *dev) { struct fwnode_handle *fwnode; + struct property_set *pset; fwnode = dev_fwnode(dev); if (!fwnode) @@ -900,16 +902,16 @@ void device_remove_properties(struct device *dev) * the pset. If there is no real firmware node (ACPI/DT) primary * will hold the pset. */ - if (is_pset_node(fwnode)) { + pset = to_pset_node(fwnode); + if (pset) { set_primary_fwnode(dev, NULL); - pset_free_set(to_pset_node(fwnode)); } else { - fwnode = fwnode->secondary; - if (!IS_ERR(fwnode) && is_pset_node(fwnode)) { + pset = to_pset_node(fwnode->secondary); + if (pset && dev == pset->dev) set_secondary_fwnode(dev, NULL); - pset_free_set(to_pset_node(fwnode)); - } } + if (pset && dev == pset->dev) + pset_free_set(pset); } EXPORT_SYMBOL_GPL(device_remove_properties); @@ -938,6 +940,7 @@ int device_add_properties(struct device *dev, p->fwnode.ops = &pset_fwnode_ops; set_secondary_fwnode(dev, &p->fwnode); + p->dev = dev; return 0; } EXPORT_SYMBOL_GPL(device_add_properties); From e36a82ee4c514a2f4f8fa30c780ad059282f5d64 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Wed, 20 Sep 2017 15:49:51 +0530 Subject: [PATCH 112/217] powerpc/livepatch: Fix livepatch stack access While running stress test with livepatch module loaded, kernel bug was triggered. cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60] 5:mon> t [c0000000eb9d3de0] c0000000eb9d3e30 (unreliable) [c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120 --- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74 [c0000000eb9d4120] 0000000057ac6e9d (unreliable) [d0000000089d9f78] 2e0965747962382e SP (965747962342e09) is in userspace When an interrupt occurs during the livepatch_handler execution, it's possible for the livepatch_stack and/or thread_info to be corrupted. eg: Task A Interrupt Handler ========= ================= livepatch_handler: mr r0, r1 ld r1, TI_livepatch_sp(r12) hardware_interrupt_common: do_IRQ+0x8: mflr r0 <- saved stack pointer is overwritten bl _mcount ... std r27,-40(r1) <- overwrite of thread_info() lis r2, STACK_END_MAGIC@h ori r2, r2, STACK_END_MAGIC@l ld r12, -8(r1) Fix the corruption by using r11 register for livepatch stack manipulation, instead of shuffling task stack and livepatch stack into r1 register. Using r11 register also avoids disabling/enabling irq's while setting up the livepatch stack. Signed-off-by: Kamalesh Babulal Reviewed-by: Naveen N. Rao Reviewed-by: Balbir Singh Signed-off-by: Michael Ellerman --- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 45 +++++++------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index c98e90b4ea7b..b4e2b7165f79 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -181,34 +181,25 @@ _GLOBAL(ftrace_stub) * - we have no stack frame and can not allocate one * - LR points back to the original caller (in A) * - CTR holds the new NIP in C - * - r0 & r12 are free - * - * r0 can't be used as the base register for a DS-form load or store, so - * we temporarily shuffle r1 (stack pointer) into r0 and then put it back. + * - r0, r11 & r12 are free */ livepatch_handler: CURRENT_THREAD_INFO(r12, r1) - /* Save stack pointer into r0 */ - mr r0, r1 - /* Allocate 3 x 8 bytes */ - ld r1, TI_livepatch_sp(r12) - addi r1, r1, 24 - std r1, TI_livepatch_sp(r12) + ld r11, TI_livepatch_sp(r12) + addi r11, r11, 24 + std r11, TI_livepatch_sp(r12) /* Save toc & real LR on livepatch stack */ - std r2, -24(r1) + std r2, -24(r11) mflr r12 - std r12, -16(r1) + std r12, -16(r11) /* Store stack end marker */ lis r12, STACK_END_MAGIC@h ori r12, r12, STACK_END_MAGIC@l - std r12, -8(r1) - - /* Restore real stack pointer */ - mr r1, r0 + std r12, -8(r11) /* Put ctr in r12 for global entry and branch there */ mfctr r12 @@ -216,36 +207,30 @@ livepatch_handler: /* * Now we are returning from the patched function to the original - * caller A. We are free to use r0 and r12, and we can use r2 until we + * caller A. We are free to use r11, r12 and we can use r2 until we * restore it. */ CURRENT_THREAD_INFO(r12, r1) - /* Save stack pointer into r0 */ - mr r0, r1 - - ld r1, TI_livepatch_sp(r12) + ld r11, TI_livepatch_sp(r12) /* Check stack marker hasn't been trashed */ lis r2, STACK_END_MAGIC@h ori r2, r2, STACK_END_MAGIC@l - ld r12, -8(r1) + ld r12, -8(r11) 1: tdne r12, r2 EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0 /* Restore LR & toc from livepatch stack */ - ld r12, -16(r1) + ld r12, -16(r11) mtlr r12 - ld r2, -24(r1) + ld r2, -24(r11) /* Pop livepatch stack frame */ - CURRENT_THREAD_INFO(r12, r0) - subi r1, r1, 24 - std r1, TI_livepatch_sp(r12) - - /* Restore real stack pointer */ - mr r1, r0 + CURRENT_THREAD_INFO(r12, r1) + subi r11, r11, 24 + std r11, TI_livepatch_sp(r12) /* Return to original caller of live patched function */ blr From 8b405d5c5d0996d3d16f70c42744a0500f5b6ec3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Oct 2017 11:13:37 +0200 Subject: [PATCH 113/217] locking/lockdep: Fix stacktrace mess There is some complication between check_prevs_add() and check_prev_add() wrt. saving stack traces. The problem is that we want to be frugal with saving stack traces, since it consumes static resources. We'll only know in check_prev_add() if we need the trace, but we can call into it multiple times. So we want to do on-demand and re-use. A further complication is that check_prev_add() can drop graph_lock and mess with our static resources. In any case, the current state; after commit: ce07a9415f26 ("locking/lockdep: Make check_prev_add() able to handle external stack_trace") is that we'll assume the trace contains valid data once check_prev_add() returns '2'. However, as noted by Josh, this is false, check_prev_add() can return '2' before having saved a trace, this then result in the possibility of using uninitialized data. Testing, as reported by Wu, shows a NULL deref. So simplify. Since the graph_lock() thing is a debug path that hasn't really been used in a long while, take it out back and avoid the head-ache. Further initialize the stack_trace to a known 'empty' state; as long as nr_entries == 0, nothing should deref entries. We can then use the 'entries == NULL' test for a valid trace / on-demand saving. Analyzed-by: Josh Poimboeuf Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ce07a9415f26 ("locking/lockdep: Make check_prev_add() able to handle external stack_trace") Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 50 +++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 44c8d0d17170..e36e652d996f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, int distance, struct stack_trace *trace, int (*save)(struct stack_trace *trace)) { - struct lock_list *entry; - int ret; - struct lock_list this; struct lock_list *uninitialized_var(target_entry); + struct lock_list *entry; + struct lock_list this; + int ret; /* * Prove that the new -> dependency would not @@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(next); this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) + if (unlikely(!ret)) { + if (!trace->entries) { + /* + * If @save fails here, the printing might trigger + * a WARN but because of the !nr_entries it should + * not do bad things. + */ + save(trace); + } return print_circular_bug(&this, target_entry, next, prev, trace); + } else if (unlikely(ret < 0)) return print_bfs_bug(ret); @@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (save && !save(trace)) + if (!trace->entries && !save(trace)) return 0; /* @@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (!ret) return 0; - /* - * Debugging printouts: - */ - if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(hlock_class(prev)); - printk(KERN_CONT " => "); - print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); - dump_stack(); - if (!graph_lock()) - return 0; - } return 2; } @@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; struct held_lock *hlock; - struct stack_trace trace; - int (*save)(struct stack_trace *trace) = save_trace; + struct stack_trace trace = { + .nr_entries = 0, + .max_entries = 0, + .entries = NULL, + .skip = 0, + }; /* * Debugging checks. @@ -2018,17 +2017,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { int ret = check_prev_add(curr, hlock, next, - distance, &trace, save); + distance, &trace, save_trace); if (!ret) return 0; - /* - * Stop saving stack_trace if save_trace() was - * called at least once: - */ - if (save && ret == 2) - save = NULL; - /* * Stop after the first non-trylock entry, * as non-trylock entries have added their From c7e2f69d3ed2e56de1f5eaaf37c0f5f91d7adb0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Oct 2017 10:50:25 +0200 Subject: [PATCH 114/217] locking/selftest: Avoid false BUG report The work-around for the expected failure is providing another failure :/ Only when CONFIG_PROVE_LOCKING=y do we increment unexpected_testcase_failures, so only then do we need to decrement, otherwise we'll end up with a negative number and that will again trigger a BUG (printout, not crash). Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: d82fed752942 ("locking/lockdep/selftests: Fix mixed read-write ABBA tests") Signed-off-by: Ingo Molnar --- lib/locking-selftest.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index cd0b5c964bd0..2b827b8a1d8c 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -2031,11 +2031,13 @@ void locking_selftest(void) print_testname("mixed read-lock/lock-write ABBA"); pr_cont(" |"); dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK); +#ifdef CONFIG_PROVE_LOCKING /* * Lockdep does indeed fail here, but there's nothing we can do about * that now. Don't kill lockdep for it. */ unexpected_testcase_failures--; +#endif pr_cont(" |"); dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM); From df0062b27ebf473b372914a3e3574d93790e2b72 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 15:20:50 +0100 Subject: [PATCH 115/217] perf/core: Avoid freeing static PMU contexts when PMU is unregistered Since commit: 1fd7e4169954 ("perf/core: Remove perf_cpu_context::unique_pmu") ... when a PMU is unregistered then its associated ->pmu_cpu_context is unconditionally freed. Whilst this is fine for dynamically allocated context types (i.e. those registered using perf_invalid_context), this causes a problem for sharing of static contexts such as perf_{sw,hw}_context, which are used by multiple built-in PMUs and effectively have a global lifetime. Whilst testing the ARM SPE driver, which must use perf_sw_context to support per-task AUX tracing, unregistering the driver as a result of a module unload resulted in: Unable to handle kernel NULL pointer dereference at virtual address 00000038 Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: [last unloaded: arm_spe_pmu] PC is at ctx_resched+0x38/0xe8 LR is at perf_event_exec+0x20c/0x278 [...] ctx_resched+0x38/0xe8 perf_event_exec+0x20c/0x278 setup_new_exec+0x88/0x118 load_elf_binary+0x26c/0x109c search_binary_handler+0x90/0x298 do_execveat_common.isra.14+0x540/0x618 SyS_execve+0x38/0x48 since the software context has been freed and the ctx.pmu->pmu_disable_count field has been set to NULL. This patch fixes the problem by avoiding the freeing of static PMU contexts altogether. Whilst the sharing of dynamic contexts is questionable, this actually requires the caller to share their context pointer explicitly and so the burden is on them to manage the object lifetime. Reported-by: Kim Phillips Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1fd7e4169954 ("perf/core: Remove perf_cpu_context::unique_pmu") Link: http://lkml.kernel.org/r/1507040450-7730-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..243bfc68d0fe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8955,6 +8955,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) static void free_pmu_context(struct pmu *pmu) { + /* + * Static contexts such as perf_sw_context have a global lifetime + * and may be shared between different PMUs. Avoid freeing them + * when a single PMU is going away. + */ + if (pmu->task_ctx_nr > perf_invalid_context) + return; + mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); mutex_unlock(&pmus_lock); From e6a5203399d19871021c1fa0eb2a08fc63b67e91 Mon Sep 17 00:00:00 2001 From: "leilei.lin" Date: Fri, 29 Sep 2017 13:54:44 +0800 Subject: [PATCH 116/217] perf/core: Fix cgroup time when scheduling descendants Update cgroup time when an event is scheduled in by descendants. Reviewed-and-tested-by: Jiri Olsa Signed-off-by: leilei.lin Signed-off-by: Peter Zijlstra (Intel) Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: brendan.d.gregg@gmail.com Cc: yang_oliver@hotmail.com Link: http://lkml.kernel.org/r/CALPjY3mkHiekRkRECzMi9G-bjUQOvOjVBAqxmWkTzc-g+0LwMg@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 243bfc68d0fe..9d93db81fa36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgrp == event->cgrp) + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) __update_cgrp_time(event->cgrp); } From d153b153446f7d8832bb2ebd92309c8a6003b3bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Sep 2017 11:35:30 +0200 Subject: [PATCH 117/217] sched/core: Fix wake_affine() performance regression Eric reported a sysbench regression against commit: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed against his v3.10 enterprise kernel. PRE (current tip/master): ivb-ep sysbench: 2: [30 secs] transactions: 64110 (2136.94 per sec.) 5: [30 secs] transactions: 143644 (4787.99 per sec.) 10: [30 secs] transactions: 274298 (9142.93 per sec.) 20: [30 secs] transactions: 418683 (13955.45 per sec.) 40: [30 secs] transactions: 320731 (10690.15 per sec.) 80: [30 secs] transactions: 355096 (11834.28 per sec.) hsw-ex NAS: OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93 lu.C.x_threads_144_run_1.log: Time in seconds = 434.68 lu.C.x_threads_144_run_2.log: Time in seconds = 405.36 lu.C.x_threads_144_run_3.log: Time in seconds = 433.83 POST (+patch): ivb-ep sysbench: 2: [30 secs] transactions: 64494 (2149.75 per sec.) 5: [30 secs] transactions: 145114 (4836.99 per sec.) 10: [30 secs] transactions: 278311 (9276.69 per sec.) 20: [30 secs] transactions: 437169 (14571.60 per sec.) 40: [30 secs] transactions: 669837 (22326.73 per sec.) 80: [30 secs] transactions: 631739 (21055.88 per sec.) hsw-ex NAS: lu.C.x_threads_144_run_1.log: Time in seconds = 23.36 lu.C.x_threads_144_run_2.log: Time in seconds = 22.96 lu.C.x_threads_144_run_3.log: Time in seconds = 22.52 This patch takes out all the shiny wake_affine() stuff and goes back to utter basics. Between the two CPUs involved with the wakeup (the CPU doing the wakeup and the CPU we ran on previously) pick the CPU we can run on _now_. This restores much of the regressions against the older kernels, but leaves some ground in the overloaded case. The default-enabled WA_WEIGHT (which will be introduced in the next patch) is an attempt to address the overloaded situation. Reported-by: Eric Farman Signed-off-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Matthew Rosato Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: jinpuwang@gmail.com Cc: vcaputo@pengaru.com Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 8 --- kernel/sched/fair.c | 126 ++++----------------------------- kernel/sched/features.h | 1 + 3 files changed, 16 insertions(+), 119 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index d7b6dab956ec..7d065abc7a47 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -71,14 +71,6 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - - /* - * Some variables from the most recent sd_lb_stats for this domain, - * used by wake_affine(). - */ - unsigned long nr_running; - unsigned long load; - unsigned long capacity; }; struct sched_domain { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..28cabed85387 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p) return 1; } -struct llc_stats { - unsigned long nr_running; - unsigned long load; - unsigned long capacity; - int has_capacity; -}; - -static bool get_llc_stats(struct llc_stats *stats, int cpu) -{ - struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - - if (!sds) - return false; - - stats->nr_running = READ_ONCE(sds->nr_running); - stats->load = READ_ONCE(sds->load); - stats->capacity = READ_ONCE(sds->capacity); - stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); - - return true; -} - /* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous + * CPU. * - * Since we're running on 'stale' values, we might in fact create an imbalance - * but recomputing these values is expensive, as that'd mean iteration 2 cache - * domains worth of CPUs. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or + * will be) idle. */ + static bool -wake_affine_llc(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +wake_affine_idle(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) { - struct llc_stats prev_stats, this_stats; - s64 this_eff_load, prev_eff_load; - unsigned long task_load; - - if (!get_llc_stats(&prev_stats, prev_cpu) || - !get_llc_stats(&this_stats, this_cpu)) - return false; - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current LLC. - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - /* in this case load hits 0 and this LLC is considered 'idle' */ - if (current_load > this_stats.load) - return true; - - this_stats.load -= current_load; - } - - /* - * The has_capacity stuff is not SMT aware, but by trying to balance - * the nr_running on both ends we try and fill the domain at equal - * rates, thereby first consuming cores before siblings. - */ - - /* if the old cache has capacity, stay there */ - if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) - return false; - - /* if this cache has capacity, come here */ - if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) + if (idle_cpu(this_cpu)) return true; - /* - * Check to see if we can move the load without causing too much - * imbalance. - */ - task_load = task_h_load(p); + if (sync && cpu_rq(this_cpu)->nr_running == 1) + return true; - this_eff_load = 100; - this_eff_load *= prev_stats.capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_stats.capacity; - - this_eff_load *= this_stats.load + task_load; - prev_eff_load *= prev_stats.load - task_load; - - return this_eff_load <= prev_eff_load; + return false; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine; + bool affine = false; - /* - * Default to no affine wakeups; wake_affine() should not effect a task - * placement the load-balancer feels inclined to undo. The conservative - * option is therefore to not move tasks when they wake up. - */ - affine = false; - - /* - * If the wakeup is across cache domains, try to evaluate if movement - * makes sense, otherwise rely on select_idle_siblings() to do - * placement inside the cache domain. - */ - if (!cpus_share_cache(prev_cpu, this_cpu)) - affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE) && !affine) + affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { @@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain_shared *shared = env->sd->shared; struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; @@ -7672,22 +7592,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; } - - if (!shared) - return; - - /* - * Since these are sums over groups they can contain some CPUs - * multiple times for the NUMA domains. - * - * Currently only wake_affine_llc() and find_busiest_group() - * uses these numbers, only the last is affected by this problem. - * - * XXX fix that. - */ - WRITE_ONCE(shared->nr_running, sds->total_running); - WRITE_ONCE(shared->load, sds->total_load); - WRITE_ONCE(shared->capacity, sds->total_capacity); } /** diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d3fb15555291..0a519f8c224d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +SCHED_FEAT(WA_IDLE, true) From f2cdd9cc6c97e617b95f430f527a6e3165e1bee8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Oct 2017 09:23:24 +0200 Subject: [PATCH 118/217] sched/core: Address more wake_affine() regressions The trivial wake_affine_idle() implementation is very good for a number of workloads, but it comes apart at the moment there are no idle CPUs left, IOW. the overloaded case. hackbench: NO_WA_WEIGHT WA_WEIGHT hackbench-20 : 7.362717561 seconds 6.450509391 seconds (win) netperf: NO_WA_WEIGHT WA_WEIGHT TCP_SENDFILE-1 : Avg: 54524.6 Avg: 52224.3 TCP_SENDFILE-10 : Avg: 48185.2 Avg: 46504.3 TCP_SENDFILE-20 : Avg: 29031.2 Avg: 28610.3 TCP_SENDFILE-40 : Avg: 9819.72 Avg: 9253.12 TCP_SENDFILE-80 : Avg: 5355.3 Avg: 4687.4 TCP_STREAM-1 : Avg: 41448.3 Avg: 42254 TCP_STREAM-10 : Avg: 24123.2 Avg: 25847.9 TCP_STREAM-20 : Avg: 15834.5 Avg: 18374.4 TCP_STREAM-40 : Avg: 5583.91 Avg: 5599.57 TCP_STREAM-80 : Avg: 2329.66 Avg: 2726.41 TCP_RR-1 : Avg: 80473.5 Avg: 82638.8 TCP_RR-10 : Avg: 72660.5 Avg: 73265.1 TCP_RR-20 : Avg: 52607.1 Avg: 52634.5 TCP_RR-40 : Avg: 57199.2 Avg: 56302.3 TCP_RR-80 : Avg: 25330.3 Avg: 26867.9 UDP_RR-1 : Avg: 108266 Avg: 107844 UDP_RR-10 : Avg: 95480 Avg: 95245.2 UDP_RR-20 : Avg: 68770.8 Avg: 68673.7 UDP_RR-40 : Avg: 76231 Avg: 75419.1 UDP_RR-80 : Avg: 34578.3 Avg: 35639.1 UDP_STREAM-1 : Avg: 64684.3 Avg: 66606 UDP_STREAM-10 : Avg: 52701.2 Avg: 52959.5 UDP_STREAM-20 : Avg: 30376.4 Avg: 29704 UDP_STREAM-40 : Avg: 15685.8 Avg: 15266.5 UDP_STREAM-80 : Avg: 8415.13 Avg: 7388.97 (wins and losses) sysbench: NO_WA_WEIGHT WA_WEIGHT sysbench-mysql-2 : 2135.17 per sec. 2142.51 per sec. sysbench-mysql-5 : 4809.68 per sec. 4800.19 per sec. sysbench-mysql-10 : 9158.59 per sec. 9157.05 per sec. sysbench-mysql-20 : 14570.70 per sec. 14543.55 per sec. sysbench-mysql-40 : 22130.56 per sec. 22184.82 per sec. sysbench-mysql-80 : 20995.56 per sec. 21904.18 per sec. sysbench-psql-2 : 1679.58 per sec. 1705.06 per sec. sysbench-psql-5 : 3797.69 per sec. 3879.93 per sec. sysbench-psql-10 : 7253.22 per sec. 7258.06 per sec. sysbench-psql-20 : 11166.75 per sec. 11220.00 per sec. sysbench-psql-40 : 17277.28 per sec. 17359.78 per sec. sysbench-psql-80 : 17112.44 per sec. 17221.16 per sec. (increase on the top end) tbench: NO_WA_WEIGHT Throughput 685.211 MB/sec 2 clients 2 procs max_latency=0.123 ms Throughput 1596.64 MB/sec 5 clients 5 procs max_latency=0.119 ms Throughput 2985.47 MB/sec 10 clients 10 procs max_latency=0.262 ms Throughput 4521.15 MB/sec 20 clients 20 procs max_latency=0.506 ms Throughput 9438.1 MB/sec 40 clients 40 procs max_latency=2.052 ms Throughput 8210.5 MB/sec 80 clients 80 procs max_latency=8.310 ms WA_WEIGHT Throughput 697.292 MB/sec 2 clients 2 procs max_latency=0.127 ms Throughput 1596.48 MB/sec 5 clients 5 procs max_latency=0.080 ms Throughput 2975.22 MB/sec 10 clients 10 procs max_latency=0.254 ms Throughput 4575.14 MB/sec 20 clients 20 procs max_latency=0.502 ms Throughput 9468.65 MB/sec 40 clients 40 procs max_latency=2.069 ms Throughput 8631.73 MB/sec 80 clients 80 procs max_latency=8.605 ms (increase on the top end) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Rik van Riel Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 2 ++ 2 files changed, 43 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28cabed85387..ed2ab474ec93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5363,6 +5363,10 @@ static int wake_wide(struct task_struct *p) * * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or * will be) idle. + * + * wake_affine_weight() - considers the weight to reflect the average + * scheduling latency of the CPUs. This seems to work + * for the overloaded case. */ static bool @@ -5378,6 +5382,40 @@ wake_affine_idle(struct sched_domain *sd, struct task_struct *p, return false; } +static bool +wake_affine_weight(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) +{ + s64 this_eff_load, prev_eff_load; + unsigned long task_load; + + this_eff_load = target_load(this_cpu, sd->wake_idx); + prev_eff_load = source_load(prev_cpu, sd->wake_idx); + + if (sync) { + unsigned long current_load = task_h_load(current); + + if (current_load > this_eff_load) + return true; + + this_eff_load -= current_load; + } + + task_load = task_h_load(p); + + this_eff_load += task_load; + if (sched_feat(WA_BIAS)) + this_eff_load *= 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load -= task_load; + if (sched_feat(WA_BIAS)) + prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); + + return this_eff_load <= prev_eff_load; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { @@ -5387,6 +5425,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_IDLE) && !affine) affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && !affine) + affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { schedstat_inc(sd->ttwu_move_affine); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 0a519f8c224d..319ed0e8a347 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -82,3 +82,5 @@ SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) From 024c9d2faebdad3fb43fe49ad68e91a36190f1e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Oct 2017 10:36:53 +0200 Subject: [PATCH 119/217] sched/core: Ensure load_balance() respects the active_mask While load_balance() masks the source CPUs against active_mask, it had a hole against the destination CPU. Ensure the destination CPU is also part of the 'domain-mask & active-mask' set. Reported-by: Levin, Alexander (Sasha Levin) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed2ab474ec93..d3f3094856fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8042,6 +8042,13 @@ static int should_we_balance(struct lb_env *env) struct sched_group *sg = env->sd->groups; int cpu, balance_cpu = -1; + /* + * Ensure the balancing environment is consistent; can happen + * when the softirq triggers 'during' hotplug. + */ + if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) + return 0; + /* * In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. From b0490a04e736356e427e227902b17f9927a56caf Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Tue, 10 Oct 2017 12:15:30 +0530 Subject: [PATCH 120/217] powerpc/lib/sstep: Fix count leading zeros instructions According to the GCC documentation, the behaviour of __builtin_clz() and __builtin_clzl() is undefined if the value of the input argument is zero. Without handling this special case, these builtins have been used for emulating the following instructions: * Count Leading Zeros Word (cntlzw[.]) * Count Leading Zeros Doubleword (cntlzd[.]) This fixes the emulated behaviour of these instructions by adding an additional check for this special case. Fixes: 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify *regs") Signed-off-by: Sandipan Das Reviewed-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 5e8418c28bd8..f208f560aecd 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1684,11 +1684,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, * Logical instructions */ case 26: /* cntlzw */ - op->val = __builtin_clz((unsigned int) regs->gpr[rd]); + val = (unsigned int) regs->gpr[rd]; + op->val = ( val ? __builtin_clz(val) : 32 ); goto logical_done; #ifdef __powerpc64__ case 58: /* cntlzd */ - op->val = __builtin_clzl(regs->gpr[rd]); + val = regs->gpr[rd]; + op->val = ( val ? __builtin_clzl(val) : 64 ); goto logical_done; #endif case 28: /* and */ From 6b2c08f989250c54f31b53dba9ace863a1f3fff6 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 4 Oct 2017 21:04:30 -0300 Subject: [PATCH 121/217] powerpc: Don't call lockdep_assert_cpus_held() from arch_update_cpu_topology() It turns out that not all paths calling arch_update_cpu_topology() hold cpu_hotplug_lock, but that's OK because those paths can't race with any concurrent hotplug events. Warnings were reported with the following trace: lockdep_assert_cpus_held arch_update_cpu_topology sched_init_domains sched_init_smp kernel_init_freeable kernel_init ret_from_kernel_thread Which is safe because it's called early in boot when hotplug is not live yet. And also this trace: lockdep_assert_cpus_held arch_update_cpu_topology partition_sched_domains cpuset_update_active_cpus sched_cpu_deactivate cpuhp_invoke_callback cpuhp_down_callbacks cpuhp_thread_fun smpboot_thread_fn kthread ret_from_kernel_thread Which is safe because it's called as part of CPU hotplug, so although we don't hold the CPU hotplug lock, there is another thread driving the CPU hotplug operation which does hold the lock, and there is no race. Thanks to tglx for deciphering it for us. Fixes: 3e401f7a2e51 ("powerpc: Only obtain cpu_hotplug_lock if called by rtasd") Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b95c584ce19d..a51df9ef529d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1438,7 +1438,6 @@ int numa_update_cpu_topology(bool cpus_locked) int arch_update_cpu_topology(void) { - lockdep_assert_cpus_held(); return numa_update_cpu_topology(true); } From 62dd86ac01f9fb6386d7f8c6b389c3ea4582a50a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:02 -0500 Subject: [PATCH 122/217] x86/unwind: Fix dereference of untrusted pointer Tetsuo Handa and Fengguang Wu reported a panic in the unwinder: BUG: unable to handle kernel NULL pointer dereference at 000001f2 IP: update_stack_state+0xd4/0x340 *pde = 00000000 Oops: 0000 [#1] PREEMPT SMP CPU: 0 PID: 18728 Comm: 01-cpu-hotplug Not tainted 4.13.0-rc4-00170-gb09be67 #592 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014 task: bb0b53c0 task.stack: bb3ac000 EIP: update_stack_state+0xd4/0x340 EFLAGS: 00010002 CPU: 0 EAX: 0000a570 EBX: bb3adccb ECX: 0000f401 EDX: 0000a570 ESI: 00000001 EDI: 000001ba EBP: bb3adc6b ESP: bb3adc3f DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: 000001f2 CR3: 0b3a7000 CR4: 00140690 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: fffe0ff0 DR7: 00000400 Call Trace: ? unwind_next_frame+0xea/0x400 ? __unwind_start+0xf5/0x180 ? __save_stack_trace+0x81/0x160 ? save_stack_trace+0x20/0x30 ? __lock_acquire+0xfa5/0x12f0 ? lock_acquire+0x1c2/0x230 ? tick_periodic+0x3a/0xf0 ? _raw_spin_lock+0x42/0x50 ? tick_periodic+0x3a/0xf0 ? tick_periodic+0x3a/0xf0 ? debug_smp_processor_id+0x12/0x20 ? tick_handle_periodic+0x23/0xc0 ? local_apic_timer_interrupt+0x63/0x70 ? smp_trace_apic_timer_interrupt+0x235/0x6a0 ? trace_apic_timer_interrupt+0x37/0x3c ? strrchr+0x23/0x50 Code: 0f 95 c1 89 c7 89 45 e4 0f b6 c1 89 c6 89 45 dc 8b 04 85 98 cb 74 bc 88 4d e3 89 45 f0 83 c0 01 84 c9 89 04 b5 98 cb 74 bc 74 3b <8b> 47 38 8b 57 34 c6 43 1d 01 25 00 00 02 00 83 e2 03 09 d0 83 EIP: update_stack_state+0xd4/0x340 SS:ESP: 0068:bb3adc3f CR2: 00000000000001f2 ---[ end trace 0d147fd4aba8ff50 ]--- Kernel panic - not syncing: Fatal exception in interrupt On x86-32, after decoding a frame pointer to get a regs address, regs_size() dereferences the regs pointer when it checks regs->cs to see if the regs are user mode. This is dangerous because it's possible that what looks like a decoded frame pointer is actually a corrupt value, and we don't want the unwinder to make things worse. Instead of calling regs_size() on an unsafe pointer, just assume they're kernel regs to start with. Later, once it's safe to access the regs, we can do the user mode check and corresponding safety check for the remaining two regs. Reported-and-tested-by: Tetsuo Handa Reported-and-tested-by: Fengguang Wu Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5ed8d8bb38c5 ("x86/unwind: Move common code into update_stack_state()") Link: http://lkml.kernel.org/r/7f95b9a6993dec7674b3f3ab3dcd3294f7b9644d.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d145a0b1f529..d05637726c10 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -184,6 +184,12 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } +#ifdef CONFIG_X86_32 +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) +#else +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) +#endif + static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -202,7 +208,7 @@ static bool update_stack_state(struct unwind_state *state, regs = decode_frame_pointer(next_bp); if (regs) { frame = (unsigned long *)regs; - len = regs_size(regs); + len = KERNEL_REGS_SIZE; state->got_irq = true; } else { frame = next_bp; @@ -226,6 +232,14 @@ static bool update_stack_state(struct unwind_state *state, frame < prev_frame_end) return false; + /* + * On 32-bit with user mode regs, make sure the last two regs are safe + * to access: + */ + if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && + !on_stack(info, frame, len + 2*sizeof(long))) + return false; + /* Move state to the next frame: */ if (regs) { state->regs = regs; From 5c99b692cfd62f6915ed7ee7ed3c38d65ba3feab Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:03 -0500 Subject: [PATCH 123/217] x86/unwind: Use MSB for frame pointer encoding on 32-bit On x86-32, Tetsuo Handa and Fengguang Wu reported unwinder warnings like: WARNING: kernel stack regs at f60bb9c8 in swapper:1 has bad 'bp' value 0ba00000 And also there were some stack dumps with a bunch of unreliable '?' symbols after an apic_timer_interrupt symbol, meaning the unwinder got confused when it tried to read the regs. The cause of those issues is that, with GCC 4.8 (and possibly older), there are cases where GCC misaligns the stack pointer in a leaf function for no apparent reason: c124a388 : c124a388: 55 push %ebp c124a389: 89 e5 mov %esp,%ebp c124a38b: 57 push %edi c124a38c: 56 push %esi c124a38d: 89 d6 mov %edx,%esi c124a38f: 53 push %ebx c124a390: 31 db xor %ebx,%ebx c124a392: 83 ec 03 sub $0x3,%esp ... c124a3e3: 83 c4 03 add $0x3,%esp c124a3e6: 5b pop %ebx c124a3e7: 5e pop %esi c124a3e8: 5f pop %edi c124a3e9: 5d pop %ebp c124a3ea: c3 ret If an interrupt occurs in such a function, the regs on the stack will be unaligned, which breaks the frame pointer encoding assumption. So on 32-bit, use the MSB instead of the LSB to encode the regs. This isn't an issue on 64-bit, because interrupts align the stack before writing to it. Reported-and-tested-by: Tetsuo Handa Reported-and-tested-by: Fengguang Wu Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/279a26996a482ca716605c7dbc7f2db9d8d91e81.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/kernel/unwind_frame.c | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 8a13d468635a..50e0d2bc4528 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -176,7 +176,7 @@ /* * This is a sneaky trick to help the unwinder find pt_regs on the stack. The * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just setting the LSB, which makes it an invalid stack address and is also + * is just clearing the MSB, which makes it an invalid stack address and is also * a signal to the unwinder that it's a pt_regs pointer in disguise. * * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the @@ -185,7 +185,7 @@ .macro ENCODE_FRAME_POINTER #ifdef CONFIG_FRAME_POINTER mov %esp, %ebp - orl $0x1, %ebp + andl $0x7fffffff, %ebp #endif .endm diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d05637726c10..4949bbc95f75 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -174,6 +174,7 @@ static bool is_last_task_frame(struct unwind_state *state) * This determines if the frame pointer actually contains an encoded pointer to * pt_regs on the stack. See ENCODE_FRAME_POINTER. */ +#ifdef CONFIG_X86_64 static struct pt_regs *decode_frame_pointer(unsigned long *bp) { unsigned long regs = (unsigned long)bp; @@ -183,6 +184,17 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } +#else +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (regs & 0x80000000) + return NULL; + + return (struct pt_regs *)(regs | 0x80000000); +} +#endif #ifdef CONFIG_X86_32 #define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) From 99bd28a49b150e4b938313a63b5532d95ba77885 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:04 -0500 Subject: [PATCH 124/217] x86/unwind: Align stack pointer in unwinder dump When printing the unwinder dump, the stack pointer could be unaligned, for one of two reasons: - stack corruption; or - GCC created an unaligned stack. There's no way for the unwinder to tell the difference between the two, so we have to assume one or the other. GCC unaligned stacks are very rare, and have only been spotted before GCC 5. Presumably, if we're doing an unwinder stack dump, stack corruption is more likely than a GCC unaligned stack. So always align the stack before starting the dump. Reported-and-tested-by: Tetsuo Handa Reported-and-tested-by: Fengguang Wu Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2f540c515946ab09ed267e1a1d6421202a0cce08.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 4949bbc95f75..81aca077fbb6 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state) state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) break; From d4a2d031dd42f9594107d317e2a9a0c6d73ad46b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:05 -0500 Subject: [PATCH 125/217] x86/unwind: Disable unwinder warnings on 32-bit x86-32 doesn't have stack validation, so in most cases it doesn't make sense to warn about bad frame pointers. Reported-by: Tetsuo Handa Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: Fengguang Wu Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a69658760800bf281e6353248c23e0fa0acf5230.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 81aca077fbb6..3dc26f95d46e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -355,6 +355,13 @@ bool unwind_next_frame(struct unwind_state *state) state->regs->sp < (unsigned long)task_pt_regs(state->task)) goto the_end; + /* + * There are some known frame pointer issues on 32-bit. Disable + * unwinder warnings on 32-bit until it gets objtool support. + */ + if (IS_ENABLED(CONFIG_X86_32)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", From 629eb703d3e46aa570c6c91235d38fd09ed8c58b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 9 Oct 2017 18:26:55 +0100 Subject: [PATCH 126/217] perf/x86/intel/uncore: Fix memory leaks on allocation failures Currently if an allocation fails then the error return paths don't free up any currently allocated pmus[].boxes and pmus causing a memory leak. Add an error clean up exit path that frees these objects. Detected by CoverityScan, CID#711632 ("Resource Leak") Signed-off-by: Colin Ian King Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Fixes: 087bfbb03269 ("perf/x86: Add generic Intel uncore PMU support") Link: http://lkml.kernel.org/r/20171009172655.6132-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 1c5390f1cf09..d45e06346f14 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -822,7 +822,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) pmus[i].type = type; pmus[i].boxes = kzalloc(size, GFP_KERNEL); if (!pmus[i].boxes) - return -ENOMEM; + goto err; } type->pmus = pmus; @@ -836,7 +836,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); if (!attr_group) - return -ENOMEM; + goto err; attrs = (struct attribute **)(attr_group + 1); attr_group->name = "events"; @@ -849,7 +849,15 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) } type->pmu_group = &uncore_pmu_attr_group; + return 0; + +err: + for (i = 0; i < type->num_boxes; i++) + kfree(pmus[i].boxes); + kfree(pmus); + + return -ENOMEM; } static int __init From a3b7424392924e778b608e30ee321f7b10cc94b8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Oct 2017 17:48:54 +0200 Subject: [PATCH 127/217] x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs hv_flush_pcpu_ex structures are not cleared between calls for performance reasons (they're variable size up to PAGE_SIZE each) but we must clear hv_vp_set.bank_contents part of it to avoid flushing unneeded vCPUs. The rest of the structure is formed correctly. To do the clearing in an efficient way stash the maximum possible vCPU number (this may differ from Linux CPU id). Reported-by: Jork Loeser Signed-off-by: Vitaly Kuznetsov Cc: Dexuan Cui Cc: Haiyang Zhang Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20171006154854.18092-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_init.c | 5 +++++ arch/x86/hyperv/mmu.c | 17 ++++++++++++----- arch/x86/include/asm/mshyperv.h | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1a8eb550c40f..a5db63f728a2 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs); u32 *hv_vp_index; EXPORT_SYMBOL_GPL(hv_vp_index); +u32 hv_max_vp_index; + static int hv_cpu_init(unsigned int cpu) { u64 msr_vp_index; @@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu) hv_vp_index[smp_processor_id()] = msr_vp_index; + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + return 0; } diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 39e7f6e50919..9502d04c0c95 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, { int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + /* valid_bank_mask can represent up to 64 banks */ + if (hv_max_vp_index / 64 >= 64) + return 0; + + /* + * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex + * structs are not cleared between calls, we risk flushing unneeded + * vCPUs otherwise. + */ + for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) + flush->hv_vp_set.bank_contents[vcpu_bank] = 0; + /* * Some banks may end up being empty but this is acceptable. */ @@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, vcpu = hv_cpu_number_to_vp_number(cpu); vcpu_bank = vcpu / 64; vcpu_offset = vcpu % 64; - - /* valid_bank_mask can represent up to 64 banks */ - if (vcpu_bank >= 64) - return 0; - __set_bit(vcpu_offset, (unsigned long *) &flush->hv_vp_set.bank_contents[vcpu_bank]); if (vcpu_bank >= nr_bank) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 738503e1f80c..530f448fddaf 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, * to this information. */ extern u32 *hv_vp_index; +extern u32 hv_max_vp_index; /** * hv_cpu_number_to_vp_number() - Map CPU to VP. From 60d73a7c96601434dfdb56d5b9167ff3b850d8d7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 5 Oct 2017 13:39:24 +0200 Subject: [PATCH 128/217] x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex structures hv_do_hypercall() does virt_to_phys() translation and with some configs (CONFIG_SLAB) this doesn't work for percpu areas, we pass wrong memory to hypervisor and get #GP. We could use working slow_virt_to_phys() instead but doing so kills the performance. Move pcpu_flush/pcpu_flush_ex structures out of percpu areas and allocate memory on first call. The additional level of indirection gives us a small performance penalty, in future we may consider introducing hypercall functions which avoid virt_to_phys() conversion and cache physical addresses of pcpu_flush/pcpu_flush_ex structures somewhere. Reported-by: Simon Xiao Signed-off-by: Vitaly Kuznetsov Cc: Dexuan Cui Cc: Haiyang Zhang Cc: Jork Loeser Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20171005113924.28021-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/mmu.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 9502d04c0c95..f21cebbb5f6c 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex { /* Each gva in gva_list encodes up to 4096 pages to flush */ #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) -static struct hv_flush_pcpu __percpu *pcpu_flush; +static struct hv_flush_pcpu __percpu **pcpu_flush; -static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex; +static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex; /* * Fills in gva_list starting from offset. Returns the number of items added. @@ -109,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, const struct flush_tlb_info *info) { int cpu, vcpu, gva_n, max_gvas; + struct hv_flush_pcpu **flush_pcpu; struct hv_flush_pcpu *flush; u64 status = U64_MAX; unsigned long flags; @@ -123,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, local_irq_save(flags); - flush = this_cpu_ptr(pcpu_flush); + flush_pcpu = this_cpu_ptr(pcpu_flush); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } if (info->mm) { flush->address_space = virt_to_phys(info->mm->pgd); @@ -180,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, const struct flush_tlb_info *info) { int nr_bank = 0, max_gvas, gva_n; + struct hv_flush_pcpu_ex **flush_pcpu; struct hv_flush_pcpu_ex *flush; u64 status = U64_MAX; unsigned long flags; @@ -194,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, local_irq_save(flags); - flush = this_cpu_ptr(pcpu_flush_ex); + flush_pcpu = this_cpu_ptr(pcpu_flush_ex); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } if (info->mm) { flush->address_space = virt_to_phys(info->mm->pgd); @@ -273,7 +295,7 @@ void hyper_alloc_mmu(void) return; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) - pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + pcpu_flush = alloc_percpu(struct hv_flush_pcpu *); else - pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *); } From ab7ff471aa5db670197070760f022622793da7e5 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Cerri Date: Thu, 5 Oct 2017 10:34:29 -0300 Subject: [PATCH 129/217] x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing Do not consider the fixed size of hv_vp_set when passing the variable header size to hv_do_rep_hypercall(). The Hyper-V hypervisor specification states that for a hypercall with a variable header only the size of the variable portion should be supplied via the input control. For HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX/LIST_EX calls that means the fixed portion of hv_vp_set should not be considered. That fixes random failures of some applications that are unexpectedly killed with SIGBUS or SIGSEGV. Signed-off-by: Marcelo Henrique Cerri Cc: Dexuan Cui Cc: Haiyang Zhang Cc: Jork Loeser Cc: Josh Poulson Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Simon Xiao Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: devel@linuxdriverproject.org Fixes: 628f54cc6451 ("x86/hyper-v: Support extended CPU ranges for TLB flush hypercalls") Link: http://lkml.kernel.org/r/1507210469-29065-1-git-send-email-marcelo.cerri@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index f21cebbb5f6c..9cc9e1c1e2db 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -251,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; status = hv_do_rep_hypercall( HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, - 0, nr_bank + 2, flush, NULL); + 0, nr_bank, flush, NULL); } else if (info->end && ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { status = hv_do_rep_hypercall( HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, - 0, nr_bank + 2, flush, NULL); + 0, nr_bank, flush, NULL); } else { gva_n = fill_gva_list(flush->gva_list, nr_bank, info->start, info->end); status = hv_do_rep_hypercall( HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, - gva_n, nr_bank + 2, flush, NULL); + gva_n, nr_bank, flush, NULL); } local_irq_restore(flags); From 124751d5e63c823092060074bd0abaae61aaa9c4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Oct 2017 14:10:32 +0200 Subject: [PATCH 130/217] ALSA: usb-audio: Kill stray URB at exiting USB-audio driver may leave a stray URB for the mixer interrupt when it exits by some error during probe. This leads to a use-after-free error as spotted by syzkaller like: ================================================================== BUG: KASAN: use-after-free in snd_usb_mixer_interrupt+0x604/0x6f0 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x23d/0x350 mm/kasan/report.c:409 __asan_report_load8_noabort+0x19/0x20 mm/kasan/report.c:430 snd_usb_mixer_interrupt+0x604/0x6f0 sound/usb/mixer.c:2490 __usb_hcd_giveback_urb+0x2e0/0x650 drivers/usb/core/hcd.c:1779 .... Allocated by task 1484: save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kmem_cache_alloc_trace+0x11e/0x2d0 mm/slub.c:2772 kmalloc ./include/linux/slab.h:493 kzalloc ./include/linux/slab.h:666 snd_usb_create_mixer+0x145/0x1010 sound/usb/mixer.c:2540 create_standard_mixer_quirk+0x58/0x80 sound/usb/quirks.c:516 snd_usb_create_quirk+0x92/0x100 sound/usb/quirks.c:560 create_composite_quirk+0x1c4/0x3e0 sound/usb/quirks.c:59 snd_usb_create_quirk+0x92/0x100 sound/usb/quirks.c:560 usb_audio_probe+0x1040/0x2c10 sound/usb/card.c:618 .... Freed by task 1484: save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 kasan_slab_free+0x72/0xc0 mm/kasan/kasan.c:524 slab_free_hook mm/slub.c:1390 slab_free_freelist_hook mm/slub.c:1412 slab_free mm/slub.c:2988 kfree+0xf6/0x2f0 mm/slub.c:3919 snd_usb_mixer_free+0x11a/0x160 sound/usb/mixer.c:2244 snd_usb_mixer_dev_free+0x36/0x50 sound/usb/mixer.c:2250 __snd_device_free+0x1ff/0x380 sound/core/device.c:91 snd_device_free_all+0x8f/0xe0 sound/core/device.c:244 snd_card_do_free sound/core/init.c:461 release_card_device+0x47/0x170 sound/core/init.c:181 device_release+0x13f/0x210 drivers/base/core.c:814 .... Actually such a URB is killed properly at disconnection when the device gets probed successfully, and what we need is to apply it for the error-path, too. In this patch, we apply snd_usb_mixer_disconnect() at releasing. Also introduce a new flag, disconnected, to struct usb_mixer_interface for not performing the disconnection procedure twice. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 12 ++++++++++-- sound/usb/mixer.h | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 9732edf77f86..91bc8f18791e 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -2234,6 +2234,9 @@ static int parse_audio_unit(struct mixer_build *state, int unitid) static void snd_usb_mixer_free(struct usb_mixer_interface *mixer) { + /* kill pending URBs */ + snd_usb_mixer_disconnect(mixer); + kfree(mixer->id_elems); if (mixer->urb) { kfree(mixer->urb->transfer_buffer); @@ -2584,8 +2587,13 @@ int snd_usb_create_mixer(struct snd_usb_audio *chip, int ctrlif, void snd_usb_mixer_disconnect(struct usb_mixer_interface *mixer) { - usb_kill_urb(mixer->urb); - usb_kill_urb(mixer->rc_urb); + if (mixer->disconnected) + return; + if (mixer->urb) + usb_kill_urb(mixer->urb); + if (mixer->rc_urb) + usb_kill_urb(mixer->rc_urb); + mixer->disconnected = true; } #ifdef CONFIG_PM diff --git a/sound/usb/mixer.h b/sound/usb/mixer.h index 2b4b067646ab..545d99b09706 100644 --- a/sound/usb/mixer.h +++ b/sound/usb/mixer.h @@ -22,6 +22,8 @@ struct usb_mixer_interface { struct urb *rc_urb; struct usb_ctrlrequest *rc_setup_packet; u8 rc_buffer[6]; + + bool disconnected; }; #define MAX_CHANNELS 16 /* max logical channels */ From eac779aa509d453a55da0ea4302bdb79c4e0854f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sun, 8 Oct 2017 19:58:46 -0700 Subject: [PATCH 131/217] xen/vcpu: Use a unified name about cpu hotplug state for pv and pvhvm As xen_cpuhp_setup is called by PV and PVHVM, the name of "x86/xen/hvm_guest" is confusing. Signed-off-by: Zhenzhong Duan Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0e7ef69e8531..d669e9d89001 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -93,11 +93,11 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), int rc; rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "x86/xen/hvm_guest:prepare", + "x86/xen/guest:prepare", cpu_up_prepare_cb, cpu_dead_cb); if (rc >= 0) { rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "x86/xen/hvm_guest:online", + "x86/xen/guest:online", xen_cpu_up_online, NULL); if (rc < 0) cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); From fd19d3b45164466a4adce7cbff448ba9189e1427 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 5 Oct 2017 11:10:22 +0200 Subject: [PATCH 132/217] KVM: nVMX: update last_nonleaf_level when initializing nested EPT The function updates context->root_level but didn't call update_last_nonleaf_level so the previous and potentially wrong value was used for page walks. For example, a zero value of last_nonleaf_level would allow a potential out-of-bounds access in arch/x86/mmu/paging_tmpl.h's walk_addr_generic function (CVE-2017-12188). Fixes: 155a97a3d7c78b46cef6f1a973c831bc5a4f82bb Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 106d4a029a8a..3c25f20115bc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4555,6 +4555,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); + update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } From 829ee279aed43faa5cb1e4d65c0cad52f2426c53 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 5 Oct 2017 11:10:23 +0200 Subject: [PATCH 133/217] KVM: MMU: always terminate page walks at level 1 is_last_gpte() is not equivalent to the pseudo-code given in commit 6bb69c9b69c31 ("KVM: MMU: simplify last_pte_bitmap") because an incorrect value of last_nonleaf_level may override the result even if level == 1. It is critical for is_last_gpte() to return true on level == 1 to terminate page walks. Otherwise memory corruption may occur as level is used as an index to various data structures throughout the page walking code. Even though the actual bug would be wherever the MMU is initialized (as in the previous patch), be defensive and ensure here that is_last_gpte() returns the correct value. This patch is also enough to fix CVE-2017-12188. Fixes: 6bb69c9b69c315200ddc2bc79aee14c0184cf5b2 Cc: stable@vger.kernel.org Cc: Andy Honig Signed-off-by: Ladi Prosek [Panic if walk_addr_generic gets an incorrect level; this is a serious bug and it's not worth a WARN_ON where the recovery path might hide further exploitable issues; suggested by Andrew Honig. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 14 +++++++------- arch/x86/kvm/paging_tmpl.h | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3c25f20115bc..7a69cf053711 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3973,13 +3973,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { - /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - /* * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. * If it is clear, there are no large pages at this level, so clear @@ -3987,6 +3980,13 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, */ gpte &= level - mmu->last_nonleaf_level; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + return gpte & PT_PAGE_SIZE_MASK; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 86b68dc5a649..f18d1f8d332b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -334,10 +334,11 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, --walker->level; index = PT_INDEX(addr, walker->level); - table_gfn = gpte_to_gfn(pte); offset = index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; + + BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; From ac3d79392f8c2728f7600dd32ed88b3a1bfdc1af Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Oct 2017 14:40:42 +0200 Subject: [PATCH 134/217] quota: Generate warnings for DQUOT_SPACE_NOFAIL allocations Eryu has reported that since commit 7b9ca4c61bc2 "quota: Reduce contention on dq_data_lock" test generic/233 occasionally fails. This is caused by the fact that since that commit we don't generate warning and set grace time for quota allocations that have DQUOT_SPACE_NOFAIL set (these are for example some metadata allocations in ext4). We need these allocations to behave regularly wrt warning generation and grace time setting so fix the code to return to the original behavior. Reported-and-tested-by: Eryu Guan CC: stable@vger.kernel.org Fixes: 7b9ca4c61bc278b771fb57d6290a31ab1fc7fdac Signed-off-by: Jan Kara --- fs/quota/dquot.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 50b0556a124f..52ad15192e72 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1297,21 +1297,18 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - goto add; + goto finish; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space + rsv_space; - if (flags & DQUOT_SPACE_NOFAIL) - goto add; - if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); ret = -EDQUOT; - goto out; + goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1322,7 +1319,7 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); ret = -EDQUOT; - goto out; + goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1338,13 +1335,21 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, * be always printed */ ret = -EDQUOT; - goto out; + goto finish; } } -add: - dquot->dq_dqb.dqb_rsvspace += rsv_space; - dquot->dq_dqb.dqb_curspace += space; -out: +finish: + /* + * We have to be careful and go through warning generation & grace time + * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it + * only here... + */ + if (flags & DQUOT_SPACE_NOFAIL) + ret = 0; + if (!ret) { + dquot->dq_dqb.dqb_rsvspace += rsv_space; + dquot->dq_dqb.dqb_curspace += space; + } spin_unlock(&dquot->dq_dqb_lock); return ret; } From b61907bb42409adf9b3120f741af7c57dd7e3db2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 9 Oct 2017 23:30:02 +0800 Subject: [PATCH 135/217] crypto: shash - Fix zero-length shash ahash digest crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shash ahash digest adaptor function may crash if given a zero-length input together with a null SG list. This is because it tries to read the SG list before looking at the length. This patch fixes it by checking the length first. Cc: Reported-by: Stephan Müller Signed-off-by: Herbert Xu Tested-by: Stephan Müller --- crypto/shash.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crypto/shash.c b/crypto/shash.c index 8fcecc66741d..325a14da5827 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -275,12 +275,14 @@ static int shash_async_finup(struct ahash_request *req) int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) { - struct scatterlist *sg = req->src; - unsigned int offset = sg->offset; unsigned int nbytes = req->nbytes; + struct scatterlist *sg; + unsigned int offset; int err; - if (nbytes < min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset)) { + if (nbytes && + (sg = req->src, offset = sg->offset, + nbytes < min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset))) { void *data; data = kmap_atomic(sg_page(sg)); From aba2d9a6385a5cc4f7a7e8eb5788e1ddbc213fc0 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 6 Oct 2017 16:35:40 -0500 Subject: [PATCH 136/217] iommu/amd: Do not disable SWIOTLB if SME is active When SME memory encryption is active it will rely on SWIOTLB to handle DMA for devices that cannot support the addressing requirements of having the encryption mask set in the physical address. The IOMMU currently disables SWIOTLB if it is not running in passthrough mode. This is not desired as non-PCI devices attempting DMA may fail. Update the code to check if SME is active and not disable SWIOTLB. Fixes: 2543a786aa25 ("iommu/amd: Allow the AMD IOMMU to work with memory encryption") Signed-off-by: Tom Lendacky Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 51f8215877f5..822679ac90a1 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2773,14 +2773,16 @@ int __init amd_iommu_init_api(void) int __init amd_iommu_init_dma_ops(void) { - swiotlb = iommu_pass_through ? 1 : 0; + swiotlb = (iommu_pass_through || sme_me_mask) ? 1 : 0; iommu_detected = 1; /* * In case we don't initialize SWIOTLB (actually the common case - * when AMD IOMMU is enabled), make sure there are global - * dma_ops set as a fall-back for devices not handled by this - * driver (for example non-PCI devices). + * when AMD IOMMU is enabled and SME is not active), make sure there + * are global dma_ops set as a fall-back for devices not handled by + * this driver (for example non-PCI devices). When SME is active, + * make sure that swiotlb variable remains set so the global dma_ops + * continue to be SWIOTLB. */ if (!swiotlb) dma_ops = &nommu_dma_ops; From 0a7480bd327afcccd7263be5b485f85943e1e903 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 Sep 2017 13:33:45 +0300 Subject: [PATCH 137/217] rpmsg: glink: Unlock on error in qcom_glink_request_intent() If qcom_glink_tx() fails, then we need to unlock before returning the error code. Fixes: 27b9c5b66b23 ("rpmsg: glink: Request for intents when unavailable") Acked-by: Sricharan R Signed-off-by: Dan Carpenter Signed-off-by: Bjorn Andersson --- drivers/rpmsg/qcom_glink_native.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c index 5a5e927ea50f..fecb1dafa8f3 100644 --- a/drivers/rpmsg/qcom_glink_native.c +++ b/drivers/rpmsg/qcom_glink_native.c @@ -1197,7 +1197,7 @@ static int qcom_glink_request_intent(struct qcom_glink *glink, ret = qcom_glink_tx(glink, &cmd, sizeof(cmd), NULL, 0, true); if (ret) - return ret; + goto unlock; ret = wait_for_completion_timeout(&channel->intent_req_comp, 10 * HZ); if (!ret) { @@ -1207,6 +1207,7 @@ static int qcom_glink_request_intent(struct qcom_glink *glink, ret = channel->intent_req_result ? 0 : -ECANCELED; } +unlock: mutex_unlock(&channel->intent_req_lock); return ret; } From b775d158530285c9657a1a0628c139b0dfd0d2e5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 Sep 2017 13:34:42 +0300 Subject: [PATCH 138/217] rpmsg: glink: Fix memory leak in qcom_glink_alloc_intent() We need to free "intent" and "intent->data" on a couple error paths. Fixes: 933b45da5d1d ("rpmsg: glink: Add support for TX intents") Acked-by: Sricharan R Signed-off-by: Dan Carpenter Signed-off-by: Bjorn Andersson --- drivers/rpmsg/qcom_glink_native.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c index fecb1dafa8f3..5dcc9bf1c5bc 100644 --- a/drivers/rpmsg/qcom_glink_native.c +++ b/drivers/rpmsg/qcom_glink_native.c @@ -635,19 +635,18 @@ qcom_glink_alloc_intent(struct qcom_glink *glink, unsigned long flags; intent = kzalloc(sizeof(*intent), GFP_KERNEL); - if (!intent) return NULL; intent->data = kzalloc(size, GFP_KERNEL); if (!intent->data) - return NULL; + goto free_intent; spin_lock_irqsave(&channel->intent_lock, flags); ret = idr_alloc_cyclic(&channel->liids, intent, 1, -1, GFP_ATOMIC); if (ret < 0) { spin_unlock_irqrestore(&channel->intent_lock, flags); - return NULL; + goto free_data; } spin_unlock_irqrestore(&channel->intent_lock, flags); @@ -656,6 +655,12 @@ qcom_glink_alloc_intent(struct qcom_glink *glink, intent->reuse = reuseable; return intent; + +free_data: + kfree(intent->data); +free_intent: + kfree(intent); + return NULL; } static void qcom_glink_handle_rx_done(struct qcom_glink *glink, From 68c2d645ebbd4a636cf93ed56f15912bcf9376bc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 Oct 2017 15:58:27 +0300 Subject: [PATCH 139/217] remoteproc: imx_rproc: fix a couple off by one bugs The priv->mem[] array has IMX7D_RPROC_MEM_MAX elements so the > should be >= to avoid writing one element beyond the end of the array. Fixes: a0ff4aa6f010 ("remoteproc: imx_rproc: add a NXP/Freescale imx_rproc driver") Signed-off-by: Dan Carpenter Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 612d91403341..81ba44510b75 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -264,7 +264,7 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, if (!(att->flags & ATT_OWN)) continue; - if (b > IMX7D_RPROC_MEM_MAX) + if (b >= IMX7D_RPROC_MEM_MAX) break; priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev, @@ -296,7 +296,7 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, return err; } - if (b > IMX7D_RPROC_MEM_MAX) + if (b >= IMX7D_RPROC_MEM_MAX) break; priv->mem[b].cpu_addr = devm_ioremap_resource(&pdev->dev, &res); From ab759b9732fd8a4ae0252bb2087e90d776f74b9f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Sep 2017 21:54:17 +0200 Subject: [PATCH 140/217] remoteproc: qcom: fix RPMSG_QCOM_GLINK_SMEM dependencies When RPMSG_QCOM_GLINK_SMEM=m and one driver causes the qcom_common.c file to be compiled as built-in, we get a link error: drivers/remoteproc/qcom_common.o: In function `glink_subdev_remove': qcom_common.c:(.text+0x130): undefined reference to `qcom_glink_smem_unregister' qcom_common.c:(.text+0x130): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `qcom_glink_smem_unregister' drivers/remoteproc/qcom_common.o: In function `glink_subdev_probe': qcom_common.c:(.text+0x160): undefined reference to `qcom_glink_smem_register' qcom_common.c:(.text+0x160): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `qcom_glink_smem_register' Out of the three PIL driver instances, QCOM_ADSP_PIL already has a Kconfig dependency to prevent this from happening, but the other two do not. This adds the same dependency there. Fixes: eea07023e6d9 ("remoteproc: qcom: adsp: Allow defining GLINK edge") Signed-off-by: Arnd Bergmann Signed-off-by: Bjorn Andersson --- drivers/remoteproc/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index df63e44526ac..bf04479456a0 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -109,6 +109,7 @@ config QCOM_Q6V5_PIL depends on OF && ARCH_QCOM depends on QCOM_SMEM depends on RPMSG_QCOM_SMD || (COMPILE_TEST && RPMSG_QCOM_SMD=n) + depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n select MFD_SYSCON select QCOM_RPROC_COMMON select QCOM_SCM @@ -120,6 +121,7 @@ config QCOM_WCNSS_PIL tristate "Qualcomm WCNSS Peripheral Image Loader" depends on OF && ARCH_QCOM depends on RPMSG_QCOM_SMD || (COMPILE_TEST && RPMSG_QCOM_SMD=n) + depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SMEM select QCOM_MDT_LOADER select QCOM_RPROC_COMMON From 084f5601c357e4ee59cf0712200d3f5c4710ba40 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 29 Sep 2017 14:26:48 +0100 Subject: [PATCH 141/217] seccomp: make function __get_seccomp_filter static The function __get_seccomp_filter is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol '__get_seccomp_filter' was not declared. Should it be static? Signed-off-by: Colin Ian King Fixes: 66a733ea6b61 ("seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter()") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index bb3a38005b9c..0ae832e13b97 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } -void __get_seccomp_filter(struct seccomp_filter *filter) +static void __get_seccomp_filter(struct seccomp_filter *filter) { /* Reference count is bounded by the number of total processes. */ refcount_inc(&filter->usage); From 8c2b4e3c3725801b57d7b858d216d38f83bdb35d Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 9 Oct 2017 12:29:35 +0200 Subject: [PATCH 142/217] Revert "PCI: tegra: Do not allocate MSI target memory" This reverts commit d7bd554f27c942e6b8b54100b4044f9be1038edf. It turns out that Tegra20 has a bug in the implementation of the MSI target address register (which is worked around by the existence of the struct tegra_pcie_soc.msi_base_shift parameter) that restricts the MSI target memory to the lower 32 bits of physical memory on that particular generation. The offending patch causes a regression on TrimSlice, which is a Tegra20-based device and has a PCI network interface card. An initial, simpler fix was to change the MSI target address for Tegra20 only, but it was pointed out that the offending commit also prevents the use of 32-bit only MSI capable devices, even on later chips. Technically this was never guaranteed to work with the prior code in the first place because the allocated page could have resided beyond the 4 GiB boundary, but it is still possible that this could've introduced a regression. The proper fix that was settled on is to select a fixed address within the lowest 32 bits of physical address space that is otherwise unused, but testing of that patch has provided mixed results that are not fully understood yet. Given all of the above and the relative urgency to get this fixed in v4.13, revert the offending commit until a universal fix is found. Fixes: d7bd554f27c9 ("PCI: tegra: Do not allocate MSI target memory") Reported-by: Tomasz Maciej Nowak Reported-by: Erik Faye-Lund Signed-off-by: Thierry Reding Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # 4.13.x --- drivers/pci/host/pci-tegra.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/pci/host/pci-tegra.c b/drivers/pci/host/pci-tegra.c index 9c40da54f88a..1987fec1f126 100644 --- a/drivers/pci/host/pci-tegra.c +++ b/drivers/pci/host/pci-tegra.c @@ -233,6 +233,7 @@ struct tegra_msi { struct msi_controller chip; DECLARE_BITMAP(used, INT_PCI_MSI_NR); struct irq_domain *domain; + unsigned long pages; struct mutex lock; u64 phys; int irq; @@ -1529,22 +1530,9 @@ static int tegra_pcie_enable_msi(struct tegra_pcie *pcie) goto err; } - /* - * The PCI host bridge on Tegra contains some logic that intercepts - * MSI writes, which means that the MSI target address doesn't have - * to point to actual physical memory. Rather than allocating one 4 - * KiB page of system memory that's never used, we can simply pick - * an arbitrary address within an area reserved for system memory - * in the FPCI address map. - * - * However, in order to avoid confusion, we pick an address that - * doesn't map to physical memory. The FPCI address map reserves a - * 1012 GiB region for system memory and memory-mapped I/O. Since - * none of the Tegra SoCs that contain this PCI host bridge can - * address more than 16 GiB of system memory, the last 4 KiB of - * these 1012 GiB is a good candidate. - */ - msi->phys = 0xfcfffff000; + /* setup AFI/FPCI range */ + msi->pages = __get_free_pages(GFP_KERNEL, 0); + msi->phys = virt_to_phys((void *)msi->pages); afi_writel(pcie, msi->phys >> soc->msi_base_shift, AFI_MSI_FPCI_BAR_ST); afi_writel(pcie, msi->phys, AFI_MSI_AXI_BAR_ST); @@ -1596,6 +1584,8 @@ static int tegra_pcie_disable_msi(struct tegra_pcie *pcie) afi_writel(pcie, 0, AFI_MSI_EN_VEC6); afi_writel(pcie, 0, AFI_MSI_EN_VEC7); + free_pages(msi->pages, 0); + if (msi->irq > 0) free_irq(msi->irq, pcie); From 407dae1e4415acde2d9f48bb76361893c4653756 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Mon, 9 Oct 2017 09:00:49 +0200 Subject: [PATCH 143/217] PCI: aardvark: Move to struct pci_host_bridge IRQ mapping functions struct pci_host_bridge gained hooks to map/swizzle IRQs, so that the IRQ mapping can be done automatically by PCI core code through the pci_assign_irq() function instead of resorting to arch-specific implementation callbacks to carry out the same task which force PCI host bridge drivers implementation to implement per-arch kludges to carry out a task that is inherently architecture agnostic. Commit 769b461fc0c0 ("arm64: PCI: Drop DT IRQ allocation from pcibios_alloc_irq()") was assuming all PCI host controller drivers had been converted to use ->map_irq(), but that wasn't the case: pci-aardvark had not been converted. Due to this, it broke the support for legacy PCI interrupts when using the pci-aardvark driver (used on Marvell Armada 3720 platforms). In order to fix this, we make sure the ->map_irq and ->swizzle_irq fields of pci_host_bridge are properly filled in. Fixes: 769b461fc0c0 ("arm64: PCI: Drop DT IRQ allocation from pcibios_alloc_irq()") Signed-off-by: Thomas Petazzoni Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v4.13+ --- drivers/pci/host/pci-aardvark.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c index 89f4e3d072d7..26ed0c08f209 100644 --- a/drivers/pci/host/pci-aardvark.c +++ b/drivers/pci/host/pci-aardvark.c @@ -935,6 +935,8 @@ static int advk_pcie_probe(struct platform_device *pdev) bridge->sysdata = pcie; bridge->busnr = 0; bridge->ops = &advk_pcie_ops; + bridge->map_irq = of_irq_parse_and_map_pci; + bridge->swizzle_irq = pci_common_swizzle; ret = pci_scan_root_bus_bridge(bridge); if (ret < 0) { From 899f0429c7d3eed886406cd72182bee3b96aa1f9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 9 Oct 2017 11:13:18 +0200 Subject: [PATCH 144/217] direct-io: Prevent NULL pointer access in submit_page_section In the code added to function submit_page_section by commit b1058b981, sdio->bio can currently be NULL when calling dio_bio_submit. This then leads to a NULL pointer access in dio_bio_submit, so check for a NULL bio in submit_page_section before trying to submit it instead. Fixes xfstest generic/250 on gfs2. Cc: stable@vger.kernel.org # v3.10+ Signed-off-by: Andreas Gruenbacher Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/direct-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e5..96415c65bbdc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -866,7 +866,8 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, */ if (sdio->boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); + if (sdio->bio) + dio_bio_submit(dio, sdio); put_page(sdio->cur_page); sdio->cur_page = NULL; } From 95d78c28b5a85bacbc29b8dba7c04babb9b0d467 Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Fri, 22 Sep 2017 01:18:39 -0400 Subject: [PATCH 145/217] fix unbalanced page refcounting in bio_map_user_iov bio_map_user_iov and bio_unmap_user do unbalanced pages refcounting if IO vector has small consecutive buffers belonging to the same page. bio_add_pc_page merges them into one, but the page reference is never dropped. Cc: stable@vger.kernel.org Signed-off-by: Vitaly Mayatskikh Signed-off-by: Al Viro --- block/bio.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/bio.c b/block/bio.c index b38e962fa83e..0d6439e89acb 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1383,6 +1383,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, offset = offset_in_page(uaddr); for (j = cur_page; j < page_limit; j++) { unsigned int bytes = PAGE_SIZE - offset; + unsigned short prev_bi_vcnt = bio->bi_vcnt; if (len <= 0) break; @@ -1397,6 +1398,13 @@ struct bio *bio_map_user_iov(struct request_queue *q, bytes) break; + /* + * check if vector was merged with previous + * drop page reference if needed + */ + if (bio->bi_vcnt == prev_bi_vcnt) + put_page(pages[j]); + len -= bytes; offset = 0; } From 2b04e8f6bbb196cab4b232af0f8d48ff2c7a8058 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Sep 2017 15:51:23 -0400 Subject: [PATCH 146/217] more bio_map_user_iov() leak fixes we need to take care of failure exit as well - pages already in bio should be dropped by analogue of bio_unmap_pages(), since their refcounts had been bumped only once per reference in bio. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- block/bio.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index 0d6439e89acb..9e9606d26cc6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1331,6 +1331,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, int ret, offset; struct iov_iter i; struct iovec iov; + struct bio_vec *bvec; iov_for_each(iov, i, *iter) { unsigned long uaddr = (unsigned long) iov.iov_base; @@ -1375,7 +1376,12 @@ struct bio *bio_map_user_iov(struct request_queue *q, ret = get_user_pages_fast(uaddr, local_nr_pages, (iter->type & WRITE) != WRITE, &pages[cur_page]); - if (ret < local_nr_pages) { + if (unlikely(ret < local_nr_pages)) { + for (j = cur_page; j < page_limit; j++) { + if (!pages[j]) + break; + put_page(pages[j]); + } ret = -EFAULT; goto out_unmap; } @@ -1431,10 +1437,8 @@ struct bio *bio_map_user_iov(struct request_queue *q, return bio; out_unmap: - for (j = 0; j < nr_pages; j++) { - if (!pages[j]) - break; - put_page(pages[j]); + bio_for_each_segment_all(bvec, bio, j) { + put_page(bvec->bv_page); } out: kfree(pages); From 1cfd0ddd82232804e03f3023f6a58b50dfef0574 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Sep 2017 10:21:15 -0400 Subject: [PATCH 147/217] bio_copy_user_iov(): don't ignore ->iov_offset Since "block: support large requests in blk_rq_map_user_iov" we started to call it with partially drained iter; that works fine on the write side, but reads create a copy of iter for completion time. And that needs to take the possibility of ->iov_iter != 0 into account... Cc: stable@vger.kernel.org #v4.5+ Signed-off-by: Al Viro --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 9e9606d26cc6..101c2a9b5481 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1239,8 +1239,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q, */ bmd->is_our_pages = map_data ? 0 : 1; memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs); - iov_iter_init(&bmd->iter, iter->type, bmd->iov, - iter->nr_segs, iter->count); + bmd->iter = *iter; + bmd->iter.iov = bmd->iov; ret = -ENOMEM; bio = bio_kmalloc(gfp_mask, nr_pages); From 71105998845fb012937332fe2e806d443c09e026 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 11:09:20 +0200 Subject: [PATCH 148/217] ALSA: seq: Fix use-after-free at creating a port There is a potential race window opened at creating and deleting a port via ioctl, as spotted by fuzzing. snd_seq_create_port() creates a port object and returns its pointer, but it doesn't take the refcount, thus it can be deleted immediately by another thread. Meanwhile, snd_seq_ioctl_create_port() still calls the function snd_seq_system_client_ev_port_start() with the created port object that is being deleted, and this triggers use-after-free like: BUG: KASAN: use-after-free in snd_seq_ioctl_create_port+0x504/0x630 [snd_seq] at addr ffff8801f2241cb1 ============================================================================= BUG kmalloc-512 (Tainted: G B ): kasan: bad access detected ----------------------------------------------------------------------------- INFO: Allocated in snd_seq_create_port+0x94/0x9b0 [snd_seq] age=1 cpu=3 pid=4511 ___slab_alloc+0x425/0x460 __slab_alloc+0x20/0x40 kmem_cache_alloc_trace+0x150/0x190 snd_seq_create_port+0x94/0x9b0 [snd_seq] snd_seq_ioctl_create_port+0xd1/0x630 [snd_seq] snd_seq_do_ioctl+0x11c/0x190 [snd_seq] snd_seq_ioctl+0x40/0x80 [snd_seq] do_vfs_ioctl+0x54b/0xda0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x16/0x75 INFO: Freed in port_delete+0x136/0x1a0 [snd_seq] age=1 cpu=2 pid=4717 __slab_free+0x204/0x310 kfree+0x15f/0x180 port_delete+0x136/0x1a0 [snd_seq] snd_seq_delete_port+0x235/0x350 [snd_seq] snd_seq_ioctl_delete_port+0xc8/0x180 [snd_seq] snd_seq_do_ioctl+0x11c/0x190 [snd_seq] snd_seq_ioctl+0x40/0x80 [snd_seq] do_vfs_ioctl+0x54b/0xda0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x16/0x75 Call Trace: [] dump_stack+0x63/0x82 [] print_trailer+0xfb/0x160 [] object_err+0x34/0x40 [] kasan_report.part.2+0x223/0x520 [] ? snd_seq_ioctl_create_port+0x504/0x630 [snd_seq] [] __asan_report_load1_noabort+0x2e/0x30 [] snd_seq_ioctl_create_port+0x504/0x630 [snd_seq] [] ? snd_seq_ioctl_delete_port+0x180/0x180 [snd_seq] [] ? taskstats_exit+0xbc0/0xbc0 [] snd_seq_do_ioctl+0x11c/0x190 [snd_seq] [] snd_seq_ioctl+0x40/0x80 [snd_seq] [] ? acct_account_cputime+0x63/0x80 [] do_vfs_ioctl+0x54b/0xda0 ..... We may fix this in a few different ways, and in this patch, it's fixed simply by taking the refcount properly at snd_seq_create_port() and letting the caller unref the object after use. Also, there is another potential use-after-free by sprintf() call in snd_seq_create_port(), and this is moved inside the lock. This fix covers CVE-2017-15265. Reported-and-tested-by: Michael23 Yu Suggested-by: Linus Torvalds Cc: Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 6 +++++- sound/core/seq/seq_ports.c | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index ea2d0ae85bd3..6c9cba2166d9 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1259,6 +1259,7 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg) struct snd_seq_port_info *info = arg; struct snd_seq_client_port *port; struct snd_seq_port_callback *callback; + int port_idx; /* it is not allowed to create the port for an another client */ if (info->addr.client != client->number) @@ -1269,7 +1270,9 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg) return -ENOMEM; if (client->type == USER_CLIENT && info->kernel) { - snd_seq_delete_port(client, port->addr.port); + port_idx = port->addr.port; + snd_seq_port_unlock(port); + snd_seq_delete_port(client, port_idx); return -EINVAL; } if (client->type == KERNEL_CLIENT) { @@ -1290,6 +1293,7 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg) snd_seq_set_port_info(port, info); snd_seq_system_client_ev_port_start(port->addr.client, port->addr.port); + snd_seq_port_unlock(port); return 0; } diff --git a/sound/core/seq/seq_ports.c b/sound/core/seq/seq_ports.c index 0a7020c82bfc..d21ece9f8d73 100644 --- a/sound/core/seq/seq_ports.c +++ b/sound/core/seq/seq_ports.c @@ -122,7 +122,9 @@ static void port_subs_info_init(struct snd_seq_port_subs_info *grp) } -/* create a port, port number is returned (-1 on failure) */ +/* create a port, port number is returned (-1 on failure); + * the caller needs to unref the port via snd_seq_port_unlock() appropriately + */ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client, int port) { @@ -151,6 +153,7 @@ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client, snd_use_lock_init(&new_port->use_lock); port_subs_info_init(&new_port->c_src); port_subs_info_init(&new_port->c_dest); + snd_use_lock_use(&new_port->use_lock); num = port >= 0 ? port : 0; mutex_lock(&client->ports_mutex); @@ -165,9 +168,9 @@ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client, list_add_tail(&new_port->list, &p->list); client->num_ports++; new_port->addr.port = num; /* store the port number in the port */ + sprintf(new_port->name, "port-%d", num); write_unlock_irqrestore(&client->ports_lock, flags); mutex_unlock(&client->ports_mutex); - sprintf(new_port->name, "port-%d", num); return new_port; } From cda77556447c782b3c9c068f81ef58136cb487c3 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 10 Oct 2017 15:13:55 +0200 Subject: [PATCH 149/217] gpu: ipu-v3: Allow channel burst locking on i.MX6 only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IDMAC_LOCK_EN registers on i.MX51 have a different layout, and on i.MX53 enabling the lock feature causes bursts to get lost. Restrict enabling the burst lock feature to i.MX6. Reported-by: Patrick Brünn Fixes: 790cb4c7c954 ("drm/imx: lock scanout transfers for consecutive bursts") Tested-by: Patrick Brünn Signed-off-by: Philipp Zabel --- drivers/gpu/ipu-v3/ipu-common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/ipu-v3/ipu-common.c b/drivers/gpu/ipu-v3/ipu-common.c index 6a573d21d3cc..658fa2d3e40c 100644 --- a/drivers/gpu/ipu-v3/ipu-common.c +++ b/drivers/gpu/ipu-v3/ipu-common.c @@ -405,6 +405,14 @@ int ipu_idmac_lock_enable(struct ipuv3_channel *channel, int num_bursts) return -EINVAL; } + /* + * IPUv3EX / i.MX51 has a different register layout, and on IPUv3M / + * i.MX53 channel arbitration locking doesn't seem to work properly. + * Allow enabling the lock feature on IPUv3H / i.MX6 only. + */ + if (bursts && ipu->ipu_type != IPUV3H) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(idmac_lock_en_info); i++) { if (channel->num == idmac_lock_en_info[i].chnum) break; From 263c3b8044f9c9356a34fdb2640b52d27e378f9c Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Mar 2017 18:01:53 +0100 Subject: [PATCH 150/217] gpu: ipu-v3: prg: wait for double buffers to be filled on channel startup Wait for both double buffer to be filled when first starting a channel. This makes channel startup a lot more reliable, probably because it allows the internal state machine to settle before the requests from the IPU are coming in. Signed-off-by: Lucas Stach [p.zabel@pengutronix.de: rebased before switch to runtime PM] Signed-off-by: Philipp Zabel --- drivers/gpu/ipu-v3/ipu-prg.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/ipu-v3/ipu-prg.c b/drivers/gpu/ipu-v3/ipu-prg.c index ecc9ea44dc50..0013ca9f72c8 100644 --- a/drivers/gpu/ipu-v3/ipu-prg.c +++ b/drivers/gpu/ipu-v3/ipu-prg.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -329,6 +330,12 @@ int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan, val = IPU_PRG_REG_UPDATE_REG_UPDATE; writel(val, prg->regs + IPU_PRG_REG_UPDATE); + /* wait for both double buffers to be filled */ + readl_poll_timeout(prg->regs + IPU_PRG_STATUS, val, + (val & IPU_PRG_STATUS_BUFFER0_READY(prg_chan)) && + (val & IPU_PRG_STATUS_BUFFER1_READY(prg_chan)), + 5, 1000); + clk_disable_unprepare(prg->clk_ipg); chan->enabled = true; From 11aff4b4c7c4b7257660ef890920f2ac72911ed0 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 18 Sep 2017 17:45:07 +0200 Subject: [PATCH 151/217] gpu: ipu-v3: pre: implement workaround for ERR009624 The PRE has a bug where a software write to the CTRL register can block the setting of the ENABLE bit by the hardware in auto repeat mode. When this happens the PRE will fail to handle new jobs. To work around this software must not write to CTRL register when the PRE store engine is inside the unsafe window, where a hardware update to the ENABLE bit may happen. Signed-off-by: Lucas Stach [p.zabel@pengutronix.de: rebased before PRE tiled prefetch support] Signed-off-by: Philipp Zabel --- drivers/gpu/ipu-v3/ipu-pre.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/gpu/ipu-v3/ipu-pre.c b/drivers/gpu/ipu-v3/ipu-pre.c index c35f74c83065..c860a7997cb5 100644 --- a/drivers/gpu/ipu-v3/ipu-pre.c +++ b/drivers/gpu/ipu-v3/ipu-pre.c @@ -73,6 +73,14 @@ #define IPU_PRE_STORE_ENG_CTRL_WR_NUM_BYTES(v) ((v & 0x7) << 1) #define IPU_PRE_STORE_ENG_CTRL_OUTPUT_ACTIVE_BPP(v) ((v & 0x3) << 4) +#define IPU_PRE_STORE_ENG_STATUS 0x120 +#define IPU_PRE_STORE_ENG_STATUS_STORE_BLOCK_X_MASK 0xffff +#define IPU_PRE_STORE_ENG_STATUS_STORE_BLOCK_X_SHIFT 0 +#define IPU_PRE_STORE_ENG_STATUS_STORE_BLOCK_Y_MASK 0x3fff +#define IPU_PRE_STORE_ENG_STATUS_STORE_BLOCK_Y_SHIFT 16 +#define IPU_PRE_STORE_ENG_STATUS_STORE_FIFO_FULL (1 << 30) +#define IPU_PRE_STORE_ENG_STATUS_STORE_FIELD (1 << 31) + #define IPU_PRE_STORE_ENG_SIZE 0x130 #define IPU_PRE_STORE_ENG_SIZE_INPUT_WIDTH(v) ((v & 0xffff) << 0) #define IPU_PRE_STORE_ENG_SIZE_INPUT_HEIGHT(v) ((v & 0xffff) << 16) @@ -93,6 +101,7 @@ struct ipu_pre { dma_addr_t buffer_paddr; void *buffer_virt; bool in_use; + unsigned int safe_window_end; }; static DEFINE_MUTEX(ipu_pre_list_mutex); @@ -160,6 +169,9 @@ void ipu_pre_configure(struct ipu_pre *pre, unsigned int width, u32 active_bpp = info->cpp[0] >> 1; u32 val; + /* calculate safe window for ctrl register updates */ + pre->safe_window_end = height - 2; + writel(bufaddr, pre->regs + IPU_PRE_CUR_BUF); writel(bufaddr, pre->regs + IPU_PRE_NEXT_BUF); @@ -199,7 +211,24 @@ void ipu_pre_configure(struct ipu_pre *pre, unsigned int width, void ipu_pre_update(struct ipu_pre *pre, unsigned int bufaddr) { + unsigned long timeout = jiffies + msecs_to_jiffies(5); + unsigned short current_yblock; + u32 val; + writel(bufaddr, pre->regs + IPU_PRE_NEXT_BUF); + + do { + if (time_after(jiffies, timeout)) { + dev_warn(pre->dev, "timeout waiting for PRE safe window\n"); + return; + } + + val = readl(pre->regs + IPU_PRE_STORE_ENG_STATUS); + current_yblock = + (val >> IPU_PRE_STORE_ENG_STATUS_STORE_BLOCK_Y_SHIFT) & + IPU_PRE_STORE_ENG_STATUS_STORE_BLOCK_Y_MASK; + } while (current_yblock == 0 || current_yblock >= pre->safe_window_end); + writel(IPU_PRE_CTRL_SDW_UPDATE, pre->regs + IPU_PRE_CTRL_SET); } From 203f44c475a1a8ace6d30c4c14ab41295081a23f Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Mon, 2 Oct 2017 12:22:53 +0100 Subject: [PATCH 152/217] usb: phy: tegra: Fix phy suspend for UDC Commit dfebb5f43a78 ("usb: chipidea: Add support for Tegra20/30/114/124") added UDC support for Tegra but with UDC support enabled, is was found that Tegra30, Tegra114 and Tegra124 would hang on entry to suspend. The hang occurred during the suspend of the USB PHY when the Tegra PHY driver attempted to disable the PHY clock. The problem is that before the Tegra PHY driver is suspended, the chipidea driver already disabled the PHY clock and when the Tegra PHY driver suspended, it could not read DEVLC register and caused the device to hang. The Tegra USB PHY driver is used by both the Tegra EHCI driver and now the chipidea UDC driver and so simply removing the disabling of the PHY clock from the USB PHY driver would not work for the Tegra EHCI driver. Fortunately, the status of the USB PHY clock can be read from the USB_SUSP_CTRL register and therefore, to workaround this issue, simply poll the register prior to disabling the clock in USB PHY driver to see if clock gating has already been initiated. Please note that it can take a few uS for the clock to disable and so simply reading this status register once on entry is not sufficient. Similarly when turning on the PHY clock, it is possible that the clock is already enabled or in the process of being enabled, and so check for this when enabling the PHY. Please note that no issues are seen with Tegra20 because it has a slightly different PHY to Tegra30/114/124. Fixes: dfebb5f43a78 ("usb: chipidea: Add support for Tegra20/30/114/124") Reviewed-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Acked-by: Thierry Reding Signed-off-by: Jon Hunter Signed-off-by: Felipe Balbi --- drivers/usb/phy/phy-tegra-usb.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/usb/phy/phy-tegra-usb.c b/drivers/usb/phy/phy-tegra-usb.c index 5fe4a5704bde..ccc2bf5274b4 100644 --- a/drivers/usb/phy/phy-tegra-usb.c +++ b/drivers/usb/phy/phy-tegra-usb.c @@ -329,6 +329,14 @@ static void utmi_phy_clk_disable(struct tegra_usb_phy *phy) unsigned long val; void __iomem *base = phy->regs; + /* + * The USB driver may have already initiated the phy clock + * disable so wait to see if the clock turns off and if not + * then proceed with gating the clock. + */ + if (utmi_wait_register(base + USB_SUSP_CTRL, USB_PHY_CLK_VALID, 0) == 0) + return; + if (phy->is_legacy_phy) { val = readl(base + USB_SUSP_CTRL); val |= USB_SUSP_SET; @@ -351,6 +359,15 @@ static void utmi_phy_clk_enable(struct tegra_usb_phy *phy) unsigned long val; void __iomem *base = phy->regs; + /* + * The USB driver may have already initiated the phy clock + * enable so wait to see if the clock turns on and if not + * then proceed with ungating the clock. + */ + if (utmi_wait_register(base + USB_SUSP_CTRL, USB_PHY_CLK_VALID, + USB_PHY_CLK_VALID) == 0) + return; + if (phy->is_legacy_phy) { val = readl(base + USB_SUSP_CTRL); val |= USB_SUSP_CLR; From ab219221a5064abfff9f78c323c4a257b16cdb81 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 6 Oct 2017 10:27:44 -0400 Subject: [PATCH 153/217] USB: dummy-hcd: Fix deadlock caused by disconnect detection The dummy-hcd driver calls the gadget driver's disconnect callback under the wrong conditions. It should invoke the callback when Vbus power is turned off, but instead it does so when the D+ pullup is turned off. This can cause a deadlock in the composite core when a gadget driver is unregistered: [ 88.361471] ============================================ [ 88.362014] WARNING: possible recursive locking detected [ 88.362580] 4.14.0-rc2+ #9 Not tainted [ 88.363010] -------------------------------------------- [ 88.363561] v4l_id/526 is trying to acquire lock: [ 88.364062] (&(&cdev->lock)->rlock){....}, at: [] composite_disconnect+0x43/0x100 [libcomposite] [ 88.365051] [ 88.365051] but task is already holding lock: [ 88.365826] (&(&cdev->lock)->rlock){....}, at: [] usb_function_deactivate+0x29/0x80 [libcomposite] [ 88.366858] [ 88.366858] other info that might help us debug this: [ 88.368301] Possible unsafe locking scenario: [ 88.368301] [ 88.369304] CPU0 [ 88.369701] ---- [ 88.370101] lock(&(&cdev->lock)->rlock); [ 88.370623] lock(&(&cdev->lock)->rlock); [ 88.371145] [ 88.371145] *** DEADLOCK *** [ 88.371145] [ 88.372211] May be due to missing lock nesting notation [ 88.372211] [ 88.373191] 2 locks held by v4l_id/526: [ 88.373715] #0: (&(&cdev->lock)->rlock){....}, at: [] usb_function_deactivate+0x29/0x80 [libcomposite] [ 88.374814] #1: (&(&dum_hcd->dum->lock)->rlock){....}, at: [] dummy_pullup+0x7d/0xf0 [dummy_hcd] [ 88.376289] [ 88.376289] stack backtrace: [ 88.377726] CPU: 0 PID: 526 Comm: v4l_id Not tainted 4.14.0-rc2+ #9 [ 88.378557] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 88.379504] Call Trace: [ 88.380019] dump_stack+0x86/0xc7 [ 88.380605] __lock_acquire+0x841/0x1120 [ 88.381252] lock_acquire+0xd5/0x1c0 [ 88.381865] ? composite_disconnect+0x43/0x100 [libcomposite] [ 88.382668] _raw_spin_lock_irqsave+0x40/0x54 [ 88.383357] ? composite_disconnect+0x43/0x100 [libcomposite] [ 88.384290] composite_disconnect+0x43/0x100 [libcomposite] [ 88.385490] set_link_state+0x2d4/0x3c0 [dummy_hcd] [ 88.386436] dummy_pullup+0xa7/0xf0 [dummy_hcd] [ 88.387195] usb_gadget_disconnect+0xd8/0x160 [udc_core] [ 88.387990] usb_gadget_deactivate+0xd3/0x160 [udc_core] [ 88.388793] usb_function_deactivate+0x64/0x80 [libcomposite] [ 88.389628] uvc_function_disconnect+0x1e/0x40 [usb_f_uvc] This patch changes the code to test the port-power status bit rather than the port-connect status bit when deciding whether to isue the callback. Signed-off-by: Alan Stern Reported-by: David Tulloh CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index b17618a55f1b..f04e91ef9e7c 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -419,6 +419,7 @@ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) static void set_link_state(struct dummy_hcd *dum_hcd) { struct dummy *dum = dum_hcd->dum; + unsigned int power_bit; dum_hcd->active = 0; if (dum->pullup) @@ -429,17 +430,19 @@ static void set_link_state(struct dummy_hcd *dum_hcd) return; set_link_state_by_speed(dum_hcd); + power_bit = (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 ? + USB_SS_PORT_STAT_POWER : USB_PORT_STAT_POWER); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0 || dum_hcd->active) dum_hcd->resuming = 0; /* Currently !connected or in reset */ - if ((dum_hcd->port_status & USB_PORT_STAT_CONNECTION) == 0 || + if ((dum_hcd->port_status & power_bit) == 0 || (dum_hcd->port_status & USB_PORT_STAT_RESET) != 0) { - unsigned disconnect = USB_PORT_STAT_CONNECTION & + unsigned int disconnect = power_bit & dum_hcd->old_status & (~dum_hcd->port_status); - unsigned reset = USB_PORT_STAT_RESET & + unsigned int reset = USB_PORT_STAT_RESET & (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ From 29c7f3e68eec4ae94d85ad7b5dfdafdb8089f513 Mon Sep 17 00:00:00 2001 From: Kazuya Mizuguchi Date: Mon, 2 Oct 2017 14:01:41 +0900 Subject: [PATCH 154/217] usb: renesas_usbhs: Fix DMAC sequence for receiving zero-length packet The DREQE bit of the DnFIFOSEL should be set to 1 after the DE bit of USB-DMAC on R-Car SoCs is set to 1 after the USB-DMAC received a zero-length packet. Otherwise, a transfer completion interruption of USB-DMAC doesn't happen. Even if the driver changes the sequence, normal operations (transmit/receive without zero-length packet) will not cause any side-effects. So, this patch fixes the sequence anyway. Signed-off-by: Kazuya Mizuguchi [shimoda: revise the commit log] Fixes: e73a9891b3a1 ("usb: renesas_usbhs: add DMAEngine support") Cc: # v3.1+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/renesas_usbhs/fifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index 68f26904c316..50285b01da92 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -857,9 +857,9 @@ static void xfer_work(struct work_struct *work) fifo->name, usbhs_pipe_number(pipe), pkt->length, pkt->zero); usbhs_pipe_running(pipe, 1); - usbhsf_dma_start(pipe, fifo); usbhs_pipe_set_trans_count_if_bulk(pipe, pkt->trans); dma_async_issue_pending(chan); + usbhsf_dma_start(pipe, fifo); usbhs_pipe_enable(pipe); xfer_work_end: From cb84f56861eb333af0a5bab475d741b13067c05c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 30 Sep 2017 11:15:29 +0300 Subject: [PATCH 155/217] usb: misc: usbtest: Fix overflow in usbtest_do_ioctl() There used to be a test against "if (param->sglen > MAX_SGLEN)" but it was removed during a refactor. It leads to an integer overflow and a stack overflow in test_queue() if we try to create a too large urbs[] array on the stack. There is a second integer overflow in test_queue() as well if "param->iterations" is too high. I don't immediately see that it's harmful but I've added a check to prevent it and silence the static checker warning. Fixes: 18fc4ebdc705 ("usb: misc: usbtest: Remove timeval usage") Acked-by: Deepa Dinamani Signed-off-by: Dan Carpenter Signed-off-by: Felipe Balbi --- drivers/usb/misc/usbtest.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index eee82ca55b7b..113e38bfe0ef 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -1964,6 +1964,9 @@ test_queue(struct usbtest_dev *dev, struct usbtest_param_32 *param, int status = 0; struct urb *urbs[param->sglen]; + if (!param->sglen || param->iterations > UINT_MAX / param->sglen) + return -EINVAL; + memset(&context, 0, sizeof(context)); context.count = param->iterations * param->sglen; context.dev = dev; @@ -2087,6 +2090,8 @@ usbtest_do_ioctl(struct usb_interface *intf, struct usbtest_param_32 *param) if (param->iterations <= 0) return -EINVAL; + if (param->sglen > MAX_SGLEN) + return -EINVAL; /* * Just a bunch of test cases that every HCD is expected to handle. * From aec17e1e249567e82b26dafbb86de7d07fde8729 Mon Sep 17 00:00:00 2001 From: Andrew Gabbasov Date: Sat, 30 Sep 2017 08:55:55 -0700 Subject: [PATCH 156/217] usb: gadget: composite: Fix use-after-free in usb_composite_overwrite_options KASAN enabled configuration reports an error BUG: KASAN: use-after-free in usb_composite_overwrite_options+... [libcomposite] at addr ... Read of size 1 by task ... when some driver is un-bound and then bound again. For example, this happens with FunctionFS driver when "ffs-test" test application is run several times in a row. If the driver has empty manufacturer ID string in initial static data, it is then replaced with generated string. After driver unbinding the generated string is freed, but the driver data still keep that pointer. And if the driver is then bound again, that pointer is re-used for string emptiness check. The fix is to clean up the driver string data upon its unbinding to drop the pointer to freed memory. Fixes: cc2683c318a5 ("usb: gadget: Provide a default implementation of default manufacturer string") Cc: stable@vger.kernel.org Signed-off-by: Andrew Gabbasov Signed-off-by: Felipe Balbi --- drivers/usb/gadget/composite.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index dd74c99d6ce1..5d061b3d8224 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -2026,6 +2026,8 @@ static DEVICE_ATTR_RO(suspended); static void __composite_unbind(struct usb_gadget *gadget, bool unbind_driver) { struct usb_composite_dev *cdev = get_gadget_data(gadget); + struct usb_gadget_strings *gstr = cdev->driver->strings[0]; + struct usb_string *dev_str = gstr->strings; /* composite_disconnect() must already have been called * by the underlying peripheral controller driver! @@ -2045,6 +2047,9 @@ static void __composite_unbind(struct usb_gadget *gadget, bool unbind_driver) composite_dev_cleanup(cdev); + if (dev_str[USB_GADGET_MANUFACTURER_IDX].s == cdev->def_manufacturer) + dev_str[USB_GADGET_MANUFACTURER_IDX].s = ""; + kfree(cdev->def_manufacturer); kfree(cdev); set_gadget_data(gadget, NULL); From ff74745e6d3d97a865eda8c1f3fd29c13b79f0cc Mon Sep 17 00:00:00 2001 From: Andrew Gabbasov Date: Sat, 30 Sep 2017 08:54:52 -0700 Subject: [PATCH 157/217] usb: gadget: configfs: Fix memory leak of interface directory data Kmemleak checking configuration reports a memory leak in usb_os_desc_prepare_interf_dir function when rndis function instance is freed and then allocated again. For example, this happens with FunctionFS driver with RNDIS function enabled when "ffs-test" test application is run several times in a row. The data for intermediate "os_desc" group for interface directories is allocated as a single VLA chunk and (after a change of default groups handling) is not ever freed and actually not stored anywhere besides inside a list of default groups of a parent group. The fix is to make usb_os_desc_prepare_interf_dir function return a pointer to allocated data (as a pointer to the first VLA item) instead of (an unused) integer and to make the caller component (currently the only one is RNDIS function) responsible for storing the pointer and freeing the memory when appropriate. Fixes: 1ae1602de028 ("configfs: switch ->default groups to a linked list") Cc: stable@vger.kernel.org Signed-off-by: Andrew Gabbasov Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 15 ++++++++------- drivers/usb/gadget/configfs.h | 11 ++++++----- drivers/usb/gadget/function/f_rndis.c | 12 ++++++++++-- drivers/usb/gadget/function/u_rndis.h | 1 + 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index a22a892de7b7..aeb9f3c40521 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1143,11 +1143,12 @@ static struct configfs_attribute *interf_grp_attrs[] = { NULL }; -int usb_os_desc_prepare_interf_dir(struct config_group *parent, - int n_interf, - struct usb_os_desc **desc, - char **names, - struct module *owner) +struct config_group *usb_os_desc_prepare_interf_dir( + struct config_group *parent, + int n_interf, + struct usb_os_desc **desc, + char **names, + struct module *owner) { struct config_group *os_desc_group; struct config_item_type *os_desc_type, *interface_type; @@ -1159,7 +1160,7 @@ int usb_os_desc_prepare_interf_dir(struct config_group *parent, char *vlabuf = kzalloc(vla_group_size(data_chunk), GFP_KERNEL); if (!vlabuf) - return -ENOMEM; + return ERR_PTR(-ENOMEM); os_desc_group = vla_ptr(vlabuf, data_chunk, os_desc_group); os_desc_type = vla_ptr(vlabuf, data_chunk, os_desc_type); @@ -1184,7 +1185,7 @@ int usb_os_desc_prepare_interf_dir(struct config_group *parent, configfs_add_default_group(&d->group, os_desc_group); } - return 0; + return os_desc_group; } EXPORT_SYMBOL(usb_os_desc_prepare_interf_dir); diff --git a/drivers/usb/gadget/configfs.h b/drivers/usb/gadget/configfs.h index 36c468c4f5e9..540d5e92ed22 100644 --- a/drivers/usb/gadget/configfs.h +++ b/drivers/usb/gadget/configfs.h @@ -5,11 +5,12 @@ void unregister_gadget_item(struct config_item *item); -int usb_os_desc_prepare_interf_dir(struct config_group *parent, - int n_interf, - struct usb_os_desc **desc, - char **names, - struct module *owner); +struct config_group *usb_os_desc_prepare_interf_dir( + struct config_group *parent, + int n_interf, + struct usb_os_desc **desc, + char **names, + struct module *owner); static inline struct usb_os_desc *to_usb_os_desc(struct config_item *item) { diff --git a/drivers/usb/gadget/function/f_rndis.c b/drivers/usb/gadget/function/f_rndis.c index e1d5853ef1e4..c7c5b3ce1d98 100644 --- a/drivers/usb/gadget/function/f_rndis.c +++ b/drivers/usb/gadget/function/f_rndis.c @@ -908,6 +908,7 @@ static void rndis_free_inst(struct usb_function_instance *f) free_netdev(opts->net); } + kfree(opts->rndis_interf_group); /* single VLA chunk */ kfree(opts); } @@ -916,6 +917,7 @@ static struct usb_function_instance *rndis_alloc_inst(void) struct f_rndis_opts *opts; struct usb_os_desc *descs[1]; char *names[1]; + struct config_group *rndis_interf_group; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) @@ -940,8 +942,14 @@ static struct usb_function_instance *rndis_alloc_inst(void) names[0] = "rndis"; config_group_init_type_name(&opts->func_inst.group, "", &rndis_func_type); - usb_os_desc_prepare_interf_dir(&opts->func_inst.group, 1, descs, - names, THIS_MODULE); + rndis_interf_group = + usb_os_desc_prepare_interf_dir(&opts->func_inst.group, 1, descs, + names, THIS_MODULE); + if (IS_ERR(rndis_interf_group)) { + rndis_free_inst(&opts->func_inst); + return ERR_CAST(rndis_interf_group); + } + opts->rndis_interf_group = rndis_interf_group; return &opts->func_inst; } diff --git a/drivers/usb/gadget/function/u_rndis.h b/drivers/usb/gadget/function/u_rndis.h index a35ee3c2545d..efdb7ac381d9 100644 --- a/drivers/usb/gadget/function/u_rndis.h +++ b/drivers/usb/gadget/function/u_rndis.h @@ -26,6 +26,7 @@ struct f_rndis_opts { bool bound; bool borrowed_net; + struct config_group *rndis_interf_group; struct usb_os_desc rndis_os_desc; char rndis_ext_compat_id[16]; From 7c80f9e4a588f1925b07134bb2e3689335f6c6d8 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 29 Sep 2017 10:54:24 -0400 Subject: [PATCH 158/217] usb: usbtest: fix NULL pointer dereference If the usbtest driver encounters a device with an IN bulk endpoint but no OUT bulk endpoint, it will try to dereference a NULL pointer (out->desc.bEndpointAddress). The problem can be solved by adding a missing test. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Felipe Balbi --- drivers/usb/misc/usbtest.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index 113e38bfe0ef..b3fc602b2e24 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -202,12 +202,13 @@ get_endpoints(struct usbtest_dev *dev, struct usb_interface *intf) return tmp; } - if (in) { + if (in) dev->in_pipe = usb_rcvbulkpipe(udev, in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); + if (out) dev->out_pipe = usb_sndbulkpipe(udev, out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); - } + if (iso_in) { dev->iso_in = &iso_in->desc; dev->in_iso_pipe = usb_rcvisocpipe(udev, From ef8daf8eeb5b8ab6bc356656163d19f20fb827ed Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Mon, 2 Oct 2017 11:56:48 -0400 Subject: [PATCH 159/217] livepatch: unpatch all klp_objects if klp_module_coming fails When an incoming module is considered for livepatching by klp_module_coming(), it iterates over multiple patches and multiple kernel objects in this order: list_for_each_entry(patch, &klp_patches, list) { klp_for_each_object(patch, obj) { which means that if one of the kernel objects fails to patch, klp_module_coming()'s error path needs to unpatch and cleanup any kernel objects that were already patched by a previous patch. Reported-by: Miroslav Benes Suggested-by: Petr Mladek Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 60 +++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index b9628e43c78f..bf8c8fd72589 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); +/* + * Remove parts of patches that touch a given kernel module. The list of + * patches processed might be limited. When limit is NULL, all patches + * will be handled. + */ +static void klp_cleanup_module_patches_limited(struct module *mod, + struct klp_patch *limit) +{ + struct klp_patch *patch; + struct klp_object *obj; + + list_for_each_entry(patch, &klp_patches, list) { + if (patch == limit) + break; + + klp_for_each_object(patch, obj) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + /* + * Only unpatch the module if the patch is enabled or + * is in transition. + */ + if (patch->enabled || patch == klp_transition_patch) { + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_unpatch_object(obj); + } + + klp_free_object_loaded(obj); + break; + } + } +} + int klp_module_coming(struct module *mod) { int ret; @@ -894,7 +929,7 @@ int klp_module_coming(struct module *mod) pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; - klp_free_object_loaded(obj); + klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); return ret; @@ -902,9 +937,6 @@ int klp_module_coming(struct module *mod) void klp_module_going(struct module *mod) { - struct klp_patch *patch; - struct klp_object *obj; - if (WARN_ON(mod->state != MODULE_STATE_GOING && mod->state != MODULE_STATE_COMING)) return; @@ -917,25 +949,7 @@ void klp_module_going(struct module *mod) */ mod->klp_alive = false; - list_for_each_entry(patch, &klp_patches, list) { - klp_for_each_object(patch, obj) { - if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) - continue; - - /* - * Only unpatch the module if the patch is enabled or - * is in transition. - */ - if (patch->enabled || patch == klp_transition_patch) { - pr_notice("reverting patch '%s' on unloading module '%s'\n", - patch->mod->name, obj->mod->name); - klp_unpatch_object(obj); - } - - klp_free_object_loaded(obj); - break; - } - } + klp_cleanup_module_patches_limited(mod, NULL); mutex_unlock(&klp_mutex); } From f043bfc98c193c284e2cd768fefabe18ac2fed9b Mon Sep 17 00:00:00 2001 From: Jaejoong Kim Date: Thu, 28 Sep 2017 19:16:30 +0900 Subject: [PATCH 160/217] HID: usbhid: fix out-of-bounds bug The hid descriptor identifies the length and type of subordinate descriptors for a device. If the received hid descriptor is smaller than the size of the struct hid_descriptor, it is possible to cause out-of-bounds. In addition, if bNumDescriptors of the hid descriptor have an incorrect value, this can also cause out-of-bounds while approaching hdesc->desc[n]. So check the size of hid descriptor and bNumDescriptors. BUG: KASAN: slab-out-of-bounds in usbhid_parse+0x9b1/0xa20 Read of size 1 at addr ffff88006c5f8edf by task kworker/1:2/1261 CPU: 1 PID: 1261 Comm: kworker/1:2 Not tainted 4.14.0-rc1-42251-gebb2c2437d80 #169 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x22f/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x19/0x20 mm/kasan/report.c:427 usbhid_parse+0x9b1/0xa20 drivers/hid/usbhid/hid-core.c:1004 hid_add_device+0x16b/0xb30 drivers/hid/hid-core.c:2944 usbhid_probe+0xc28/0x1100 drivers/hid/usbhid/hid-core.c:1369 usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_set_configuration+0x104e/0x1870 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4903 hub_port_connect_change drivers/usb/core/hub.c:5009 port_event drivers/usb/core/hub.c:5115 hub_event+0x194d/0x3740 drivers/usb/core/hub.c:5195 process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119 worker_thread+0x221/0x1850 kernel/workqueue.c:2253 kthread+0x3a1/0x470 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 Cc: stable@vger.kernel.org Reported-by: Andrey Konovalov Signed-off-by: Jaejoong Kim Tested-by: Andrey Konovalov Acked-by: Alan Stern Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 089bad8a9a21..045b5da9b992 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -975,6 +975,8 @@ static int usbhid_parse(struct hid_device *hid) unsigned int rsize = 0; char *rdesc; int ret, n; + int num_descriptors; + size_t offset = offsetof(struct hid_descriptor, desc); quirks = usbhid_lookup_quirk(le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); @@ -997,10 +999,18 @@ static int usbhid_parse(struct hid_device *hid) return -ENODEV; } + if (hdesc->bLength < sizeof(struct hid_descriptor)) { + dbg_hid("hid descriptor is too short\n"); + return -EINVAL; + } + hid->version = le16_to_cpu(hdesc->bcdHID); hid->country = hdesc->bCountryCode; - for (n = 0; n < hdesc->bNumDescriptors; n++) + num_descriptors = min_t(int, hdesc->bNumDescriptors, + (hdesc->bLength - offset) / sizeof(struct hid_class_descriptor)); + + for (n = 0; n < num_descriptors; n++) if (hdesc->desc[n].bDescriptorType == HID_DT_REPORT) rsize = le16_to_cpu(hdesc->desc[n].wDescriptorLength); From a0933a456ff83a3b5ffa3a1903e0b8de4a56adf5 Mon Sep 17 00:00:00 2001 From: Alex Manoussakis Date: Thu, 5 Oct 2017 13:41:20 -0400 Subject: [PATCH 161/217] HID: hid-elecom: extend to fix descriptor for HUGE trackball In addition to DEFT, Elecom introduced a larger trackball called HUGE, in both wired (M-HT1URBK) and wireless (M-HT1DRBK) versions. It has the same buttons and behavior as the DEFT. This patch adds the two relevant USB IDs to enable operation of the three Fn buttons on the top of the device. Cc: Diego Elio Petteno Signed-off-by: Alex Manoussakis Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 1 + drivers/hid/hid-core.c | 2 ++ drivers/hid/hid-elecom.c | 13 +++++++++---- drivers/hid/hid-ids.h | 2 ++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 0a3117cc29e7..374301fcbc86 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -281,6 +281,7 @@ config HID_ELECOM Support for ELECOM devices: - BM084 Bluetooth Mouse - DEFT Trackball (Wired and wireless) + - HUGE Trackball (Wired and wireless) config HID_ELO tristate "ELO USB 4000/4500 touchscreen" diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 9bc91160819b..330ca983828b 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2032,6 +2032,8 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_DEFT_WIRED) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_DEFT_WIRELESS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_HUGE_WIRED) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_HUGE_WIRELESS) }, #endif #if IS_ENABLED(CONFIG_HID_ELO) { HID_USB_DEVICE(USB_VENDOR_ID_ELO, 0x0009) }, diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c index e2c7465df69f..54aeea57d209 100644 --- a/drivers/hid/hid-elecom.c +++ b/drivers/hid/hid-elecom.c @@ -3,6 +3,7 @@ * Copyright (c) 2010 Richard Nauber * Copyright (c) 2016 Yuxuan Shui * Copyright (c) 2017 Diego Elio Pettenò + * Copyright (c) 2017 Alex Manoussakis */ /* @@ -32,9 +33,11 @@ static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, break; case USB_DEVICE_ID_ELECOM_DEFT_WIRED: case USB_DEVICE_ID_ELECOM_DEFT_WIRELESS: - /* The DEFT trackball has eight buttons, but its descriptor only - * reports five, disabling the three Fn buttons on the top of - * the mouse. + case USB_DEVICE_ID_ELECOM_HUGE_WIRED: + case USB_DEVICE_ID_ELECOM_HUGE_WIRELESS: + /* The DEFT/HUGE trackball has eight buttons, but its descriptor + * only reports five, disabling the three Fn buttons on the top + * of the mouse. * * Apply the following diff to the descriptor: * @@ -62,7 +65,7 @@ static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, * End Collection, End Collection, */ if (*rsize == 213 && rdesc[13] == 5 && rdesc[21] == 5) { - hid_info(hdev, "Fixing up Elecom DEFT Fn buttons\n"); + hid_info(hdev, "Fixing up Elecom DEFT/HUGE Fn buttons\n"); rdesc[13] = 8; /* Button/Variable Report Count */ rdesc[21] = 8; /* Button/Variable Usage Maximum */ rdesc[29] = 0; /* Button/Constant Report Count */ @@ -76,6 +79,8 @@ static const struct hid_device_id elecom_devices[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_DEFT_WIRED) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_DEFT_WIRELESS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_HUGE_WIRED) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_HUGE_WIRELESS) }, { } }; MODULE_DEVICE_TABLE(hid, elecom_devices); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index a98919199858..be2e005c3c51 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -368,6 +368,8 @@ #define USB_DEVICE_ID_ELECOM_BM084 0x0061 #define USB_DEVICE_ID_ELECOM_DEFT_WIRED 0x00fe #define USB_DEVICE_ID_ELECOM_DEFT_WIRELESS 0x00ff +#define USB_DEVICE_ID_ELECOM_HUGE_WIRED 0x010c +#define USB_DEVICE_ID_ELECOM_HUGE_WIRELESS 0x010d #define USB_VENDOR_ID_DREAM_CHEEKY 0x1d34 #define USB_DEVICE_ID_DREAM_CHEEKY_WN 0x0004 From 99fee508245825765ff60155fed43f970ff83a8f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Oct 2017 16:39:02 +0200 Subject: [PATCH 162/217] ALSA: caiaq: Fix stray URB at probe error path caiaq driver doesn't kill the URB properly at its error path during the probe, which may lead to a use-after-free error later. This patch addresses it. Reported-by: Johan Hovold Reviewed-by: Johan Hovold Cc: Signed-off-by: Takashi Iwai --- sound/usb/caiaq/device.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sound/usb/caiaq/device.c b/sound/usb/caiaq/device.c index 0fb6b1b79261..d8409d9ae55b 100644 --- a/sound/usb/caiaq/device.c +++ b/sound/usb/caiaq/device.c @@ -469,10 +469,12 @@ static int init_card(struct snd_usb_caiaqdev *cdev) err = snd_usb_caiaq_send_command(cdev, EP1_CMD_GET_DEVICE_INFO, NULL, 0); if (err) - return err; + goto err_kill_urb; - if (!wait_event_timeout(cdev->ep1_wait_queue, cdev->spec_received, HZ)) - return -ENODEV; + if (!wait_event_timeout(cdev->ep1_wait_queue, cdev->spec_received, HZ)) { + err = -ENODEV; + goto err_kill_urb; + } usb_string(usb_dev, usb_dev->descriptor.iManufacturer, cdev->vendor_name, CAIAQ_USB_STR_LEN); @@ -507,6 +509,10 @@ static int init_card(struct snd_usb_caiaqdev *cdev) setup_card(cdev); return 0; + + err_kill_urb: + usb_kill_urb(&cdev->ep1_in_urb); + return err; } static int snd_probe(struct usb_interface *intf, From 67bb8e999e0aeac285d22f0e53c856b9df5282c6 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 10 Oct 2017 14:45:04 -0500 Subject: [PATCH 163/217] x86/mm: Disable various instrumentations of mm/mem_encrypt.c and mm/tlb.c Some routines in mem_encrypt.c are called very early in the boot process, e.g. sme_enable(). When CONFIG_KCOV=y is defined the resulting code added to sme_enable() (and others) for KCOV instrumentation results in a kernel crash. Disable the KCOV instrumentation for mem_encrypt.c by adding KCOV_INSTRUMENT_mem_encrypt.o := n to arch/x86/mm/Makefile. In order to avoid other possible early boot issues, model mem_encrypt.c after head64.c in regards to tools. In addition to disabling KCOV as stated above and a previous patch that disables branch profiling, also remove the "-pg" CFLAG if CONFIG_FUNCTION_TRACER is enabled and set KASAN_SANITIZE to "n", each of which are done on a file basis. Reported-by: kernel test robot Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171010194504.18887.38053.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 72bf8c01c6e3..e1f095884386 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,12 @@ -# Kernel does not boot with instrumentation of tlb.c. -KCOV_INSTRUMENT_tlb.o := n +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n + +KASAN_SANITIZE_mem_encrypt.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mem_encrypt.o = -pg +endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o setup_nx.o tlb.o From 56ae414e9d2718bcbfda9ba3797c39005e2f90fb Mon Sep 17 00:00:00 2001 From: Alexander Levin Date: Mon, 10 Apr 2017 18:46:51 +0000 Subject: [PATCH 164/217] 9p: set page uptodate when required in write_end() Commit 77469c3f570 prevented setting the page as uptodate when we wrote the right amount of data, fix that. Fixes: 77469c3f570 ("9p: saner ->write_end() on failing copy into non-uptodate page") Reviewed-by: Jan Kara Signed-off-by: Alexander Levin Signed-off-by: Linus Torvalds --- fs/9p/vfs_addr.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index adaf6f6dd858..e1cbdfdb7c68 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -310,9 +310,13 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - if (unlikely(copied < len && !PageUptodate(page))) { - copied = 0; - goto out; + if (!PageUptodate(page)) { + if (unlikely(copied < len)) { + copied = 0; + goto out; + } else if (len == PAGE_SIZE) { + SetPageUptodate(page); + } } /* * No need to use i_size_read() here, the i_size From 20413e37d71befd02b5846acdaf5e2564dd1c38e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Oct 2017 11:37:22 -0700 Subject: [PATCH 165/217] xfs: Don't log uninitialised fields in inode structures Prevent kmemcheck from throwing warnings about reading uninitialised memory when formatting inodes into the incore log buffer. There are several issues here - we don't always log all the fields in the inode log format item, and we never log the inode the di_next_unlinked field. In the case of the inode log format item, this is exacerbated by the old xfs_inode_log_format structure padding issue. Hence make the padded, 64 bit aligned version of the structure the one we always use for formatting the log and get rid of the 64 bit variant. This means we'll always log the 64-bit version and so recovery only needs to convert from the unpadded 32 bit version from older 32 bit kernels. Signed-Off-By: Dave Chinner Tested-by: Tetsuo Handa Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_log_format.h | 27 ++++-------- fs/xfs/xfs_inode_item.c | 75 ++++++++++++++++++---------------- fs/xfs/xfs_ondisk.h | 2 +- 3 files changed, 48 insertions(+), 56 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8372e9bcd7b6..71de185735e0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format { uint32_t ilf_fields; /* flags for fields logged */ uint16_t ilf_asize; /* size of attr d/ext/root */ uint16_t ilf_dsize; /* size of data/ext/root */ + uint32_t ilf_pad; /* pad for 64 bit boundary */ uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ @@ -280,7 +281,12 @@ typedef struct xfs_inode_log_format { int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; -typedef struct xfs_inode_log_format_32 { +/* + * Old 32 bit systems will log in this format without the 64 bit + * alignment padding. Recovery will detect this and convert it to the + * correct format. + */ +struct xfs_inode_log_format_32 { uint16_t ilf_type; /* inode log item type */ uint16_t ilf_size; /* size of this item */ uint32_t ilf_fields; /* flags for fields logged */ @@ -294,24 +300,7 @@ typedef struct xfs_inode_log_format_32 { int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ int32_t ilf_boffset; /* off of inode in buffer */ -} __attribute__((packed)) xfs_inode_log_format_32_t; - -typedef struct xfs_inode_log_format_64 { - uint16_t ilf_type; /* inode log item type */ - uint16_t ilf_size; /* size of this item */ - uint32_t ilf_fields; /* flags for fields logged */ - uint16_t ilf_asize; /* size of attr d/ext/root */ - uint16_t ilf_dsize; /* size of data/ext/root */ - uint32_t ilf_pad; /* pad for 64 bit boundary */ - uint64_t ilf_ino; /* inode number */ - union { - uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - int64_t ilf_blkno; /* blkno of inode buffer */ - int32_t ilf_len; /* len of inode buffer */ - int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_64_t; +} __attribute__((packed)); /* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a705f34b58fa..9bbc2d7cc8cb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -364,6 +364,9 @@ xfs_inode_to_log_dinode( to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; + /* log a dummy value to ensure log structure is fully initialised */ + to->di_next_unlinked = NULLAGINO; + if (from->di_version == 3) { to->di_changecount = inode->i_version; to->di_crtime.t_sec = from->di_crtime.t_sec; @@ -404,6 +407,11 @@ xfs_inode_item_format_core( * the second with the on-disk inode structure, and a possible third and/or * fourth with the inode data/extents/b-tree root and inode attributes * data/extents/b-tree root. + * + * Note: Always use the 64 bit inode log format structure so we don't + * leave an uninitialised hole in the format item on 64 bit systems. Log + * recovery on 32 bit systems handles this just fine, so there's no reason + * for not using an initialising the properly padded structure all the time. */ STATIC void xfs_inode_item_format( @@ -412,8 +420,8 @@ xfs_inode_item_format( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_inode_log_format *ilf; struct xfs_log_iovec *vecp = NULL; + struct xfs_inode_log_format *ilf; ASSERT(ip->i_d.di_version > 1); @@ -425,7 +433,17 @@ xfs_inode_item_format( ilf->ilf_boffset = ip->i_imap.im_boffset; ilf->ilf_fields = XFS_ILOG_CORE; ilf->ilf_size = 2; /* format + core */ - xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + /* + * make sure we don't leak uninitialised data into the log in the case + * when we don't log every field in the inode. + */ + ilf->ilf_dsize = 0; + ilf->ilf_asize = 0; + ilf->ilf_pad = 0; + uuid_copy(&ilf->ilf_u.ilfu_uuid, &uuid_null); + + xlog_finish_iovec(lv, vecp, sizeof(*ilf)); xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); @@ -855,44 +873,29 @@ xfs_istale_done( } /* - * convert an xfs_inode_log_format struct from either 32 or 64 bit versions - * (which can have different field alignments) to the native version + * convert an xfs_inode_log_format struct from the old 32 bit version + * (which can have different field alignments) to the native 64 bit version */ int xfs_inode_item_format_convert( - xfs_log_iovec_t *buf, - xfs_inode_log_format_t *in_f) + struct xfs_log_iovec *buf, + struct xfs_inode_log_format *in_f) { - if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { - xfs_inode_log_format_32_t *in_f32 = buf->i_addr; + struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; - in_f->ilf_type = in_f32->ilf_type; - in_f->ilf_size = in_f32->ilf_size; - in_f->ilf_fields = in_f32->ilf_fields; - in_f->ilf_asize = in_f32->ilf_asize; - in_f->ilf_dsize = in_f32->ilf_dsize; - in_f->ilf_ino = in_f32->ilf_ino; - /* copy biggest field of ilf_u */ - uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); - in_f->ilf_blkno = in_f32->ilf_blkno; - in_f->ilf_len = in_f32->ilf_len; - in_f->ilf_boffset = in_f32->ilf_boffset; - return 0; - } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ - xfs_inode_log_format_64_t *in_f64 = buf->i_addr; + if (buf->i_len != sizeof(*in_f32)) + return -EFSCORRUPTED; - in_f->ilf_type = in_f64->ilf_type; - in_f->ilf_size = in_f64->ilf_size; - in_f->ilf_fields = in_f64->ilf_fields; - in_f->ilf_asize = in_f64->ilf_asize; - in_f->ilf_dsize = in_f64->ilf_dsize; - in_f->ilf_ino = in_f64->ilf_ino; - /* copy biggest field of ilf_u */ - uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid); - in_f->ilf_blkno = in_f64->ilf_blkno; - in_f->ilf_len = in_f64->ilf_len; - in_f->ilf_boffset = in_f64->ilf_boffset; - return 0; - } - return -EFSCORRUPTED; + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + /* copy biggest field of ilf_u */ + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; } diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0c381d71b242..0492436a053f 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); - XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); } From bb9c2e5433250f5b477035dc478314f8e6dd5e36 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Oct 2017 11:37:22 -0700 Subject: [PATCH 166/217] xfs: move more RT specific code under CONFIG_XFS_RT Various utility functions and interfaces that iterate internal devices try to reference the realtime device even when RT support is not compiled into the kernel. Make sure this code is excluded from the CONFIG_XFS_RT=n build, and where appropriate stub functions to return fatal errors if they ever get called when RT support is not present. Signed-Off-By: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 2 ++ fs/xfs/xfs_bmap_util.h | 13 +++++++++++++ fs/xfs/xfs_fsmap.c | 12 ++++++++++++ 3 files changed, 27 insertions(+) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e9db7fc95b70..6503cfa44262 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -84,6 +84,7 @@ xfs_zero_extent( GFP_NOFS, 0); } +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -190,6 +191,7 @@ xfs_bmap_rtalloc( } return 0; } +#endif /* CONFIG_XFS_RT */ /* * Check if the endoff is outside the last extent. If so the caller will grow diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 0eaa81dc49be..7d330b3c77c3 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -28,7 +28,20 @@ struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +#else /* !CONFIG_XFS_RT */ +/* + * Attempts to allocate RT extents when RT is disable indicates corruption and + * should trigger a shutdown. + */ +static inline int +xfs_bmap_rtalloc(struct xfs_bmalloca *ap) +{ + return -EFSCORRUPTED; +} +#endif /* CONFIG_XFS_RT */ + int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 814ed729881d..560e0b40ac1b 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -521,6 +521,7 @@ __xfs_getfsmap_rtdev( return query_fn(tp, info); } +#ifdef CONFIG_XFS_RT /* Actually query the realtime bitmap. */ STATIC int xfs_getfsmap_rtdev_rtbitmap_query( @@ -561,6 +562,7 @@ xfs_getfsmap_rtdev_rtbitmap( return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, info); } +#endif /* CONFIG_XFS_RT */ /* Execute a getfsmap query against the regular data device. */ STATIC int @@ -795,7 +797,15 @@ xfs_getfsmap_check_keys( return false; } +/* + * There are only two devices if we didn't configure RT devices at build time. + */ +#ifdef CONFIG_XFS_RT #define XFS_GETFSMAP_DEVS 3 +#else +#define XFS_GETFSMAP_DEVS 2 +#endif /* CONFIG_XFS_RT */ + /* * Get filesystem's extents as described in head, and format for * output. Calls formatter to fill the user's buffer until all @@ -853,10 +863,12 @@ xfs_getfsmap( handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); handlers[1].fn = xfs_getfsmap_logdev; } +#ifdef CONFIG_XFS_RT if (mp->m_rtdev_targp) { handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } +#endif /* CONFIG_XFS_RT */ xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev), xfs_getfsmap_dev_compare); From 67f2ffe31d1a683170c2ba0ecc643e42a5fdd397 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Oct 2017 11:37:23 -0700 Subject: [PATCH 167/217] xfs: don't change inode mode if ACL update fails If we get ENOSPC half way through setting the ACL, the inode mode can still be changed even though the ACL does not exist. Reorder the operation to only change the mode of the inode if the ACL is set correctly. Whilst this does not fix the problem with crash consistency (that requires attribute addition to be a deferred op) it does prevent ENOSPC and other non-fatal errors setting an xattr to be handled sanely. This fixes xfstests generic/449. Signed-Off-By: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_acl.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 7034e17535de..3354140de07e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode) int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + umode_t mode; + bool set_mode = false; int error = 0; if (!acl) @@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode; - error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - error = xfs_set_mode(inode, mode); - if (error) - return error; + set_mode = true; } set_acl: - return __xfs_set_acl(inode, acl, type); + error = __xfs_set_acl(inode, acl, type); + if (error) + return error; + + /* + * We set the mode after successfully updating the ACL xattr because the + * xattr update can fail at ENOSPC and we don't want to change the mode + * if the ACL update hasn't been applied. + */ + if (set_mode) + error = xfs_set_mode(inode, mode); + + return error; } From 749f24f33e87c1706d716c283027595b72a034f3 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 9 Oct 2017 11:38:54 -0700 Subject: [PATCH 168/217] xfs: Fix bool initialization/comparison Bool initializations should use true and false. Bool tests don't need comparisons. Signed-off-by: Thomas Meyer Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_ialloc.c | 4 ++-- fs/xfs/xfs_file.c | 4 ++-- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_mount.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 044a363119be..def32fa1c225 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1477,14 +1477,14 @@ xfs_bmap_isaeof( int is_empty; int error; - bma->aeof = 0; + bma->aeof = false; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); if (error) return error; if (is_empty) { - bma->aeof = 1; + bma->aeof = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 988bb3f31446..dfd643909f85 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1962,7 +1962,7 @@ xfs_difree_inobt( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { - xic->deleted = 1; + xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); @@ -1989,7 +1989,7 @@ xfs_difree_inobt( xfs_difree_inode_chunk(mp, agno, &rec, dfops); } else { - xic->deleted = 0; + xic->deleted = false; error = xfs_inobt_update(cur, &rec); if (error) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 309e26c9dddb..56d0e526870c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -764,7 +764,7 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; - bool do_file_insert = 0; + bool do_file_insert = false; if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -825,7 +825,7 @@ xfs_file_fallocate( error = -EINVAL; goto out_unlock; } - do_file_insert = 1; + do_file_insert = true; } else { flags |= XFS_PREALLOC_SET; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c5107c7bc4bf..dc95a49d62e7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2515,7 +2515,7 @@ xlog_write( if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0 && ordered == false) { + if (record_cnt == 0 && !ordered) { if (!lv) return 0; break; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ea7d4b4e50d0..e9727d0a541a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -704,7 +704,7 @@ xfs_mountfs( xfs_set_maxicount(mp); /* enable fail_at_unmount as default */ - mp->m_fail_unmount = 1; + mp->m_fail_unmount = true; error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) From f35c5e10c6ed6ba52a8dd8573924a80b6a02f03f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 9 Oct 2017 11:38:56 -0700 Subject: [PATCH 169/217] xfs: reinit btree pointer on attr tree inactivation walk xfs_attr3_root_inactive() walks the attr fork tree to invalidate the associated blocks. xfs_attr3_node_inactive() recursively descends from internal blocks to leaf blocks, caching block address values along the way to revisit parent blocks, locate the next entry and descend down that branch of the tree. The code that attempts to reread the parent block is unsafe because it assumes that the local xfs_da_node_entry pointer remains valid after an xfs_trans_brelse() and re-read of the parent buffer. Under heavy memory pressure, it is possible that the buffer has been reclaimed and reallocated by the time the parent block is reread. This means that 'btree' can point to an invalid memory address, lead to a random/garbage value for child_fsb and cause the subsequent read of the attr fork to go off the rails and return a NULL buffer for an attr fork offset that is most likely not allocated. Note that this problem can be manufactured by setting XFS_ATTR_BTREE_REF to 0 to prevent LRU caching of attr buffers, creating a file with a multi-level attr fork and removing it to trigger inactivation. To address this problem, reinit the node/btree pointers to the parent buffer after it has been re-read. This ensures btree points to a valid record and allows the walk to proceed. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_attr_inactive.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index ebd66b19fbfc..e3a950ed35a8 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -302,6 +302,8 @@ xfs_attr3_node_inactive( &bp, XFS_ATTR_FORK); if (error) return error; + node = bp->b_addr; + btree = dp->d_ops->node_tree_p(node); child_fsb = be32_to_cpu(btree[i + 1].before); xfs_trans_brelse(*trans, bp); } From 93e8befc17f6d6ea92b0aee3741ceac8bca4590f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 9 Oct 2017 21:08:06 -0700 Subject: [PATCH 170/217] xfs: handle error if xfs_btree_get_bufs fails Jason reported that a corrupted filesystem failed to replay the log with a metadata block out of bounds warning: XFS (dm-2): _xfs_buf_find: Block out of range: block 0x80270fff8, EOFS 0x9c40000 _xfs_buf_find() and xfs_btree_get_bufs() return NULL if that happens, and then when xfs_alloc_fix_freelist() calls xfs_trans_binval() on that NULL bp, we oops with: BUG: unable to handle kernel NULL pointer dereference at 00000000000000f8 We don't handle _xfs_buf_find errors very well, every caller higher up the stack gets to guess at why it failed. But we should at least handle it somehow, so return EFSCORRUPTED here. Reported-by: Jason L Tibbitts III Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 744dcaec34cc..f965ce832bc0 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1584,6 +1584,10 @@ xfs_alloc_ag_vextent_small( bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto error0; + } xfs_trans_binval(args->tp, bp); } args->len = 1; @@ -2141,6 +2145,10 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto out_agbp_relse; + } xfs_trans_binval(tp, bp); } From ea850f64c2722278f150dc11de2141baeb24211c Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 28 Sep 2017 11:21:57 +0300 Subject: [PATCH 171/217] drm/i915/bios: parse DDI ports also for CHV for HDMI DDC pin and DP AUX channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While technically CHV isn't DDI, we do look at the VBT based DDI port info for HDMI DDC pin and DP AUX channel. (We call these "alternate", but they're really just something that aren't platform defaults.) In commit e4ab73a13291 ("drm/i915: Respect alternate_ddc_pin for all DDI ports") Ville writes, "IIRC there may be CHV system that might actually need this." I'm not sure why there couldn't be even more platforms that need this, but start conservative, and parse the info for CHV in addition to DDI. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100553 Reported-by: Marek Wilczewski Cc: stable@vger.kernel.org Reviewed-by: Ville Syrjälä Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/d0815082cb98487618429b62414854137049b888.1506586821.git.jani.nikula@intel.com (cherry picked from commit 348e4058ebf53904e817eec7a1b25327143c2ed2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_bios.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 00c6aee0a9a1..5d4cd3d00564 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1240,7 +1240,7 @@ static void parse_ddi_ports(struct drm_i915_private *dev_priv, { enum port port; - if (!HAS_DDI(dev_priv)) + if (!HAS_DDI(dev_priv) && !IS_CHERRYVIEW(dev_priv)) return; if (!dev_priv->vbt.child_dev_num) From 68a39a3e9fe1d6f0dfe59e3f4d6bc6765e01c903 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 11 Oct 2017 10:48:44 +0000 Subject: [PATCH 172/217] remoteproc: imx_rproc: fix return value check in imx_rproc_addr_init() In case of error, the function devm_ioremap() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Reviewed-by: Oleksij Rempel Signed-off-by: Wei Yongjun Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 81ba44510b75..633268e9d550 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -269,10 +269,9 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev, att->sa, att->size); - if (IS_ERR(priv->mem[b].cpu_addr)) { + if (!priv->mem[b].cpu_addr) { dev_err(dev, "devm_ioremap_resource failed\n"); - err = PTR_ERR(priv->mem[b].cpu_addr); - return err; + return -ENOMEM; } priv->mem[b].sys_addr = att->sa; priv->mem[b].size = att->size; From c343bc2ce2c627b6cef2b09794a4a5b63419a798 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 26 Sep 2017 12:08:27 +0300 Subject: [PATCH 173/217] ACPI: properties: Align return codes of __acpi_node_get_property_reference() acpi_fwnode_get_reference_args(), the function implementing ACPI support for fwnode_property_get_reference_args(), returns directly error codes from __acpi_node_get_property_reference(). The latter uses different error codes than the OF implementation. In particular, the OF implementation uses -ENOENT to indicate that the property is not found, a reference entry is empty and there are no more references. Document and align the error codes for property for fwnode_property_get_reference_args() so that they match with of_parse_phandle_with_args(). Fixes: 3e3119d3088f (device property: Introduce fwnode_property_get_reference_args) Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 19 +++++++++---------- drivers/base/property.c | 4 ++++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 3fb8ff513461..5a8ac5e1081b 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -571,10 +571,9 @@ static int acpi_data_get_property_array(const struct acpi_device_data *data, * } * } * - * Calling this function with index %2 return %-ENOENT and with index %3 - * returns the last entry. If the property does not contain any more values - * %-ENODATA is returned. The NULL entry must be single integer and - * preferably contain value %0. + * Calling this function with index %2 or index %3 return %-ENOENT. If the + * property does not contain any more values %-ENOENT is returned. The NULL + * entry must be single integer and preferably contain value %0. * * Return: %0 on success, negative error code on failure. */ @@ -590,7 +589,7 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, data = acpi_device_data_of_node(fwnode); if (!data) - return -EINVAL; + return -ENOENT; ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj); if (ret) @@ -635,7 +634,7 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, ret = acpi_bus_get_device(element->reference.handle, &device); if (ret) - return -ENODEV; + return -EINVAL; nargs = 0; element++; @@ -649,11 +648,11 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, else if (type == ACPI_TYPE_LOCAL_REFERENCE) break; else - return -EPROTO; + return -EINVAL; } if (nargs > MAX_ACPI_REFERENCE_ARGS) - return -EPROTO; + return -EINVAL; if (idx == index) { args->adev = device; @@ -670,13 +669,13 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, return -ENOENT; element++; } else { - return -EPROTO; + return -EINVAL; } idx++; } - return -ENODATA; + return -ENOENT; } EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference); diff --git a/drivers/base/property.c b/drivers/base/property.c index 21fcc13013a5..7ed99c1b2a8b 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -683,6 +683,10 @@ EXPORT_SYMBOL_GPL(fwnode_property_match_string); * Caller is responsible to call fwnode_handle_put() on the returned * args->fwnode pointer. * + * Returns: %0 on success + * %-ENOENT when the index is out of bounds, the index has an empty + * reference or the property was not found + * %-EINVAL on parse error */ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, From 51858a2777f025333c5ac3b3484263bba56461b3 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 11 Oct 2017 11:06:13 +0300 Subject: [PATCH 174/217] ACPI: properties: Fix __acpi_node_get_property_reference() return codes Fix more return codes for device property: Align return codes of __acpi_node_get_property_reference(). In particular, what was missed previously: -EPROTO could be returned in certain cases, now -EINVAL; -EINVAL was returned if the property was not found, now -ENOENT; -EINVAL was returned also if the index was higher than the number of entries in a package, now -ENOENT. Reported-by: Hyungwoo Yang Fixes: 3e3119d3088f (device property: Introduce fwnode_property_get_reference_args) Signed-off-by: Sakari Ailus Tested-by: Hyungwoo Yang Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 5a8ac5e1081b..e26ea209b63e 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -593,7 +593,7 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj); if (ret) - return ret; + return ret == -EINVAL ? -ENOENT : -EINVAL; /* * The simplest case is when the value is a single reference. Just @@ -605,7 +605,7 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, ret = acpi_bus_get_device(obj->reference.handle, &device); if (ret) - return ret; + return ret == -ENODEV ? -EINVAL : ret; args->adev = device; args->nargs = 0; @@ -621,8 +621,10 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, * The index argument is then used to determine which reference * the caller wants (along with the arguments). */ - if (obj->type != ACPI_TYPE_PACKAGE || index >= obj->package.count) - return -EPROTO; + if (obj->type != ACPI_TYPE_PACKAGE) + return -EINVAL; + if (index >= obj->package.count) + return -ENOENT; element = obj->package.elements; end = element + obj->package.count; From 9d25e3cc83d731ae4eeb017fd07562fde3f80bef Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 9 Oct 2017 13:40:23 +0200 Subject: [PATCH 175/217] iommu/exynos: Remove initconst attribute to avoid potential kernel oops Exynos SYSMMU registers standard platform device with sysmmu_of_match table, what means that this table is accessed every time a new platform device is registered in a system. This might happen also after the boot, so the table must not be attributed as initconst to avoid potential kernel oops caused by access to freed memory. Fixes: 6b21a5db3642 ("iommu/exynos: Support for device tree") Signed-off-by: Marek Szyprowski Reviewed-by: Krzysztof Kozlowski Signed-off-by: Joerg Roedel --- drivers/iommu/exynos-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index f596fcc32898..25c2c75f5332 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -709,7 +709,7 @@ static const struct dev_pm_ops sysmmu_pm_ops = { pm_runtime_force_resume) }; -static const struct of_device_id sysmmu_of_match[] __initconst = { +static const struct of_device_id sysmmu_of_match[] = { { .compatible = "samsung,exynos-sysmmu", }, { }, }; From 8eb3f87d903168bdbd1222776a6b1e281f50513e Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 10 Oct 2017 15:01:22 +0800 Subject: [PATCH 176/217] KVM: nVMX: fix guest CR4 loading when emulating L2 to L1 exit When KVM emulates an exit from L2 to L1, it loads L1 CR4 into the guest CR4. Before this CR4 loading, the guest CR4 refers to L2 CR4. Because these two CR4's are in different levels of guest, we should vmx_set_cr4() rather than kvm_set_cr4() here. The latter, which is used to handle guest writes to its CR4, checks the guest change to CR4 and may fail if the change is invalid. The failure may cause trouble. Consider we start a L1 guest with non-zero L1 PCID in use, (i.e. L1 CR4.PCIDE == 1 && L1 CR3.PCID != 0) and a L2 guest with L2 PCID disabled, (i.e. L2 CR4.PCIDE == 0) and following events may happen: 1. If kvm_set_cr4() is used in load_vmcs12_host_state() to load L1 CR4 into guest CR4 (in VMCS01) for L2 to L1 exit, it will fail because of PCID check. As a result, the guest CR4 recorded in L0 KVM (i.e. vcpu->arch.cr4) is left to the value of L2 CR4. 2. Later, if L1 attempts to change its CR4, e.g., clearing VMXE bit, kvm_set_cr4() in L0 KVM will think L1 also wants to enable PCID, because the wrong L2 CR4 is used by L0 KVM as L1 CR4. As L1 CR3.PCID != 0, L0 KVM will inject GP to L1 guest. Fixes: 4704d0befb072 ("KVM: nVMX: Exiting from L2 to L1") Cc: qemu-stable@nongnu.org Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a2b804e10c95..95a01609d7ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11297,7 +11297,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* Same as above - no reason to call set_cr4_guest_host_mask(). */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); + vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); From 8a60aea62100c79fb5d151a1e261f11534c3dbff Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 9 Oct 2017 16:34:01 +0200 Subject: [PATCH 177/217] MAINTAINERS: Add Paul Mackerras as maintainer for KVM/powerpc Paul is handling almost all of the powerpc related KVM patches nowadays, so he should be mentioned in the MAINTAINERS file accordingly. Signed-off-by: Thomas Huth Signed-off-by: Paolo Bonzini --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2d3d750b19c0..6457e5163f93 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7571,7 +7571,7 @@ F: arch/mips/include/asm/kvm* F: arch/mips/kvm/ KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc) -M: Alexander Graf +M: Paul Mackerras L: kvm-ppc@vger.kernel.org W: http://www.linux-kvm.org/ T: git git://github.com/agraf/linux-2.6.git From 0d923820c6db1644c27c2d0a5af8920fc0f8cd81 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Wed, 4 Oct 2017 12:20:52 +0530 Subject: [PATCH 178/217] powerpc/perf: Fix for core/nest imc call trace on cpuhotplug Nest/core pmu units are enabled only when it is used. A reference count is maintained for the events which uses the nest/core pmu units. Currently in *_imc_counters_release function a WARN() is used for notification of any underflow of ref count. The case where event ref count hit a negative value is, when perf session is started, followed by offlining of all cpus in a given core. i.e. in cpuhotplug offline path ppc_core_imc_cpu_offline() function set the ref->count to zero, if the current cpu which is about to offline is the last cpu in a given core and make an OPAL call to disable the engine in that core. And on perf session termination, perf->destroy (core_imc_counters_release) will first decrement the ref->count for this core and based on the ref->count value an opal call is made to disable the core-imc engine. Now, since cpuhotplug path already clears the ref->count for core and disabled the engine, perf->destroy() decrementing again at event termination make it negative which in turn fires the WARN_ON. The same happens for nest units. Add a check to see if the reference count is alreday zero, before decrementing the count, so that the ref count will not hit a negative value. Signed-off-by: Anju T Sudhakar Reviewed-by: Santosh Sivaraj Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 9ccac86f3463..e3a1f65933b5 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -399,6 +399,20 @@ static void nest_imc_counters_release(struct perf_event *event) /* Take the mutex lock for this node and then decrement the reference count */ mutex_lock(&ref->lock); + if (ref->refc == 0) { + /* + * The scenario where this is true is, when perf session is + * started, followed by offlining of all cpus in a given node. + * + * In the cpuhotplug offline path, ppc_nest_imc_cpu_offline() + * function set the ref->count to zero, if the cpu which is + * about to offline is the last cpu in a given node and make + * an OPAL call to disable the engine in that node. + * + */ + mutex_unlock(&ref->lock); + return; + } ref->refc--; if (ref->refc == 0) { rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, @@ -646,6 +660,20 @@ static void core_imc_counters_release(struct perf_event *event) return; mutex_lock(&ref->lock); + if (ref->refc == 0) { + /* + * The scenario where this is true is, when perf session is + * started, followed by offlining of all cpus in a given core. + * + * In the cpuhotplug offline path, ppc_core_imc_cpu_offline() + * function set the ref->count to zero, if the cpu which is + * about to offline is the last cpu in a given core and make + * an OPAL call to disable the engine in that core. + * + */ + mutex_unlock(&ref->lock); + return; + } ref->refc--; if (ref->refc == 0) { rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, From cd4f2b30e5ef7d4bde61eb515372d96e8aec1690 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Wed, 11 Oct 2017 18:27:39 +0530 Subject: [PATCH 179/217] powerpc/perf: Add ___GFP_NOWARN flag to alloc_pages_node() Stack trace output during a stress test: [ 4.310049] Freeing initrd memory: 22592K [ 4.310646] rtas_flash: no firmware flash support [ 4.313341] cpuhp/64: page allocation failure: order:0, mode:0x14480c0(GFP_KERNEL|__GFP_ZERO|__GFP_THISNODE), nodemask=(null) [ 4.313465] cpuhp/64 cpuset=/ mems_allowed=0 [ 4.313521] CPU: 64 PID: 392 Comm: cpuhp/64 Not tainted 4.11.0-39.el7a.ppc64le #1 [ 4.313588] Call Trace: [ 4.313622] [c000000f1fb1b8e0] [c000000000c09388] dump_stack+0xb0/0xf0 (unreliable) [ 4.313694] [c000000f1fb1b920] [c00000000030ef6c] warn_alloc+0x12c/0x1c0 [ 4.313753] [c000000f1fb1b9c0] [c00000000030ff68] __alloc_pages_nodemask+0xea8/0x1000 [ 4.313823] [c000000f1fb1bbb0] [c000000000113a8c] core_imc_mem_init+0xbc/0x1c0 [ 4.313892] [c000000f1fb1bc00] [c000000000113cdc] ppc_core_imc_cpu_online+0x14c/0x170 [ 4.313962] [c000000f1fb1bc90] [c000000000125758] cpuhp_invoke_callback+0x198/0x5d0 [ 4.314031] [c000000f1fb1bd00] [c00000000012782c] cpuhp_thread_fun+0x8c/0x3d0 [ 4.314101] [c000000f1fb1bd60] [c0000000001678d0] smpboot_thread_fn+0x290/0x2a0 [ 4.314169] [c000000f1fb1bdc0] [c00000000015ee78] kthread+0x168/0x1b0 [ 4.314229] [c000000f1fb1be30] [c00000000000b368] ret_from_kernel_thread+0x5c/0x74 [ 4.314313] Mem-Info: [ 4.314356] active_anon:0 inactive_anon:0 isolated_anon:0 core_imc_mem_init() at system boot use alloc_pages_node() to get memory and alloc_pages_node() throws this stack dump when tried to allocate memory from a node which has no memory behind it. Add a ___GFP_NOWARN flag in allocation request as a fix. Signed-off-by: Anju T Sudhakar Reported-by: Michael Ellerman Reported-by: Venkat R.B Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index e3a1f65933b5..39a0203fd8a5 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -537,8 +537,8 @@ static int core_imc_mem_init(int cpu, int size) /* We need only vbase for core counters */ mem_info->vbase = page_address(alloc_pages_node(phys_id, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - get_order(size))); + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); if (!mem_info->vbase) return -ENOMEM; @@ -791,8 +791,8 @@ static int thread_imc_mem_alloc(int cpu_id, int size) * free the memory in cpu offline path. */ local_mem = page_address(alloc_pages_node(phys_id, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - get_order(size))); + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); if (!local_mem) return -ENOMEM; From 27b94b4f1386c3a8181f5a0277434a32e24e7dd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 1 Sep 2017 09:22:56 +0200 Subject: [PATCH 180/217] drm/amdgpu: fix placement flags in amdgpu_ttm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise we lose the NO_EVICT flag and can try to evict pinned BOs. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 7ef6c28a34d9..bc746131987f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -834,7 +834,7 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *bo_mem) placement.busy_placement = &placements; placements.fpfn = 0; placements.lpfn = adev->mc.gart_size >> PAGE_SHIFT; - placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT; + placements.flags = bo->mem.placement | TTM_PL_FLAG_TT; r = ttm_bo_mem_space(bo, &placement, &tmp, true, false); if (unlikely(r)) From aa3c2ba1c3a7c25d0440a8ac3ddd266c0f43b7b7 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 12 Oct 2017 08:37:45 -0400 Subject: [PATCH 181/217] drm/msm/mdp5: add missing max size for 8x74 v1 This should have same max width as v2. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c index c2bdad88447e..824067d2d427 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c @@ -83,6 +83,8 @@ const struct mdp5_cfg_hw msm8x74v1_config = { .caps = MDP_LM_CAP_WB }, }, .nb_stages = 5, + .max_width = 2048, + .max_height = 0xFFFF, }, .dspp = { .count = 3, From f44001e2637138d9d506efe8da67011f8170e860 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 2 Oct 2017 10:28:37 -0400 Subject: [PATCH 182/217] drm/msm: use proper memory barriers for updating tail/head Fixes intermittent corruption of cmdstream dump. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_rd.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c index 0366b8092f97..ec56794ad039 100644 --- a/drivers/gpu/drm/msm/msm_rd.c +++ b/drivers/gpu/drm/msm/msm_rd.c @@ -111,10 +111,14 @@ static void rd_write(struct msm_rd_state *rd, const void *buf, int sz) wait_event(rd->fifo_event, circ_space(&rd->fifo) > 0); + /* Note that smp_load_acquire() is not strictly required + * as CIRC_SPACE_TO_END() does not access the tail more + * than once. + */ n = min(sz, circ_space_to_end(&rd->fifo)); memcpy(fptr, ptr, n); - fifo->head = (fifo->head + n) & (BUF_SZ - 1); + smp_store_release(&fifo->head, (fifo->head + n) & (BUF_SZ - 1)); sz -= n; ptr += n; @@ -145,13 +149,17 @@ static ssize_t rd_read(struct file *file, char __user *buf, if (ret) goto out; + /* Note that smp_load_acquire() is not strictly required + * as CIRC_CNT_TO_END() does not access the head more than + * once. + */ n = min_t(int, sz, circ_count_to_end(&rd->fifo)); if (copy_to_user(buf, fptr, n)) { ret = -EFAULT; goto out; } - fifo->tail = (fifo->tail + n) & (BUF_SZ - 1); + smp_store_release(&fifo->tail, (fifo->tail + n) & (BUF_SZ - 1)); *ppos += n; wake_up_all(&rd->fifo_event); From c9811d0fa55929b182f62e0ee49b71b0bea6a936 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 11 Oct 2017 11:36:56 +0000 Subject: [PATCH 183/217] drm/msm: fix return value check in _msm_gem_kernel_new() In case of error, the function msm_gem_get_vaddr() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: 8223286d62e2 ("drm/msm: Add a helper function for in-kernel buffer allocations") Signed-off-by: Wei Yongjun Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index f15821a0d900..0b338fbf97ce 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1045,10 +1045,10 @@ static void *_msm_gem_kernel_new(struct drm_device *dev, uint32_t size, } vaddr = msm_gem_get_vaddr(obj); - if (!vaddr) { + if (IS_ERR(vaddr)) { msm_gem_put_iova(obj, aspace); drm_gem_object_unreference(obj); - return ERR_PTR(-ENOMEM); + return ERR_CAST(vaddr); } if (bo) From cc6afe2240298049585e86b1ade85efc8a7f225d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Oct 2017 12:12:57 +0200 Subject: [PATCH 184/217] x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on hypervisors Commit 594a30fb1242 ("x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on CPUs without the feature", 2017-08-30) was also about silencing the warning on VirtualBox; however, KVM does expose the TSC deadline timer, and it's virtualized so that it is immune from CPU errata. Therefore, booting 4.13 with "-cpu Haswell" shows this in the logs: [ 0.000000] [Firmware Bug]: TSC_DEADLINE disabled due to Errata; please update microcode to version: 0xb2 (or later) Even if you had a hypervisor that does _not_ virtualize the TSC deadline and rather exposes the hardware one, it should be the hypervisors task to update microcode and possibly hide the flag from CPUID. So just hide the message when running on _any_ hypervisor, not just those that do not support the TSC deadline timer. The older check still makes sense, so keep it. Fixes: bd9240a18e ("x86/apic: Add TSC_DEADLINE quirk due to errata") Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Hans de Goede Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1507630377-54471-1-git-send-email-pbonzini@redhat.com --- arch/x86/kernel/apic/apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d705c769f77d..50109eae8cd7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -600,7 +600,8 @@ static void apic_check_deadline_errata(void) const struct x86_cpu_id *m; u32 rev; - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || + boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; m = x86_match_cpu(deadline_match); From 616dd5872e52493863b0202632703eebd51243dc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 11 Oct 2017 17:16:04 -0400 Subject: [PATCH 185/217] x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping SKX stepping-3 fixed the TSC_DEADLINE issue in a different ucode version number than stepping-4. Linux needs to know this stepping-3 specific version number to also enable the TSC_DEADLINE on stepping-3. The steppings and ucode versions are documented in the SKX BIOS update: https://downloadmirror.intel.com/26978/eng/ReleaseNotes_R00.01.0004.txt Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/60f2bbf7cf617e212b522e663f84225bfebc50e5.1507756305.git.len.brown@intel.com --- arch/x86/kernel/apic/apic.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 50109eae8cd7..ff891772c9f8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void) return ~0U; } +static u32 skx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x03: return 0x01000136; + case 0x04: return 0x02000014; + } + + return ~0U; +} + static const struct x86_cpu_id deadline_match[] = { DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), From a18a0ea0096833ecb52053b183fcf9709f7bafd8 Mon Sep 17 00:00:00 2001 From: Archit Taneja Date: Fri, 6 Oct 2017 16:27:06 +0530 Subject: [PATCH 186/217] drm/msm/dsi: Use correct pm_runtime_put variant during host_init The DSI runtime PM suspend/resume callbacks check whether msm_host->cfg_hnd is non-NULL before trying to enable the bus clocks. This is done to accommodate early calls to these functions that may happen before the bus clocks are even initialized. Calling pm_runtime_put_autosuspend() in dsi_host_init() can result in racy behaviour since msm_host->cfg_hnd is set very soon after. If the suspend callback happens too late, we end up trying to disable clocks that were never enabled, resulting in a bunch of WARN_ON splats. Use pm_runtime_put_sync() so that the suspend callback is called immediately. Reported-by: Nicolas Dechesne Signed-off-by: Archit Taneja Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/dsi_host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c b/drivers/gpu/drm/msm/dsi/dsi_host.c index dbb31a014419..deaf869374ea 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_host.c +++ b/drivers/gpu/drm/msm/dsi/dsi_host.c @@ -248,7 +248,7 @@ static const struct msm_dsi_cfg_handler *dsi_get_config( clk_disable_unprepare(ahb_clk); disable_gdsc: regulator_disable(gdsc_reg); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); put_clk: clk_put(ahb_clk); put_gdsc: From 9e4621531e2af230611c28c67306a31e1a09f76a Mon Sep 17 00:00:00 2001 From: Archit Taneja Date: Fri, 6 Oct 2017 16:27:07 +0530 Subject: [PATCH 187/217] drm/msm/mdp5: Remove extra pm_runtime_put call in mdp5_crtc_cursor_set() While converting mdp5_enable/disable() calls to pm_runtime_get/put() API, an extra call to pm_runtime_put_autosuspend() crept in mdp5_crtc_cursor_set(). This results in calling the suspend handler twice, and therefore clk_disables twice, which isn't a nice thing to do. Fixes: d68fe15b1878 (drm/msm/mdp5: Use runtime PM get/put API instead ...) Reported-by: Stanimir Varbanov Signed-off-by: Archit Taneja Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c index 6fcb58ab718c..440977677001 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c @@ -804,8 +804,6 @@ static int mdp5_crtc_cursor_set(struct drm_crtc *crtc, spin_unlock_irqrestore(&mdp5_crtc->cursor.lock, flags); - pm_runtime_put_autosuspend(&pdev->dev); - set_cursor: ret = mdp5_ctl_set_cursor(ctl, pipeline, 0, cursor_enable); if (ret) { From db179e0d0d1003f10b798e072524be6bcdae5053 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Sep 2017 15:25:10 +0900 Subject: [PATCH 188/217] of: do not leak console options Do not strdup() console options. It seems that the only reason for it to be strdup()-ed was a compilation warning: printk, UART and console drivers, for some reason, expect char pointer instead of const char pointer. So we can just pass `of_stdout_options', but need to cast it to char pointer. A better fix would be to change printk, console drivers and UART to accept const char `options'; but that will take time - there are lots of drivers to update. The patch also fixes a possible memory leak: add_preferred_console() can fail, but we don't kfree() options. Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Rob Herring --- drivers/of/base.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 260d33c0f26c..63897531cd75 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1781,8 +1781,12 @@ bool of_console_check(struct device_node *dn, char *name, int index) { if (!dn || dn != of_stdout || console_set_on_cmdline) return false; - return !add_preferred_console(name, index, - kstrdup(of_stdout_options, GFP_KERNEL)); + + /* + * XXX: cast `options' to char pointer to suppress complication + * warnings: printk, UART and console drivers expect char pointer. + */ + return !add_preferred_console(name, index, (char *)of_stdout_options); } EXPORT_SYMBOL_GPL(of_console_check); From 22f8cc6e33731678e7687a18ffd0f578131edb4c Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Tue, 26 Sep 2017 18:40:00 +1000 Subject: [PATCH 189/217] drivers: of: increase MAX_RESERVED_REGIONS to 32 There are two types of memory reservations firmware can ask the kernel to make in the device tree: static and dynamic. See Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt If you have greater than 16 entries in /reserved-memory (as we do on POWER9 systems) you would get this scary looking error message: [ 0.000000] OF: reserved mem: not enough space all defined regions. This is harmless if all your reservations are static (which with OPAL on POWER9, they are). It is not harmless if you have any dynamic reservations after the 16th. In the first pass over the fdt to find reservations, the child nodes of /reserved-memory are added to a static array in of_reserved_mem.c so that memory can be reserved in a 2nd pass. The array has 16 entries. This is why, on my dual socket POWER9 system, I get that error 4 times with 20 static reservations. We don't have a problem on ppc though, as in arch/powerpc/kernel/prom.c we look at the new style /reserved-ranges property to do reservations, and this logic was introduced in 0962e8004e974 (well before any powernv system shipped). A Google search shows up no occurances of that exact error message, so we're probably safe in that no machine that people use has memory not being reserved when it should be. The simple fix is to bump the length of the array to 32 which "should be enough for everyone(TM)". The simple fix of not recording static allocations in the array would cause problems for devices with "memory-region" properties. A more future-proof fix is likely possible, although more invasive and this simple fix is perfectly suitable in the meantime while a more future-proof fix is developed. Signed-off-by: Stewart Smith Tested-by: Mauricio Faria de Oliveira Signed-off-by: Rob Herring --- drivers/of/of_reserved_mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index d507c3569a88..32771c2ced7b 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -25,7 +25,7 @@ #include #include -#define MAX_RESERVED_REGIONS 16 +#define MAX_RESERVED_REGIONS 32 static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS]; static int reserved_mem_count; From 3314c6bdd26880e0dfbcb0cb85a1b36d185ce47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Tue, 22 Aug 2017 02:19:12 +0200 Subject: [PATCH 190/217] device property: preserve usecount for node passed to of_fwnode_graph_get_port_parent() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using CONFIG_OF_DYNAMIC=y uncovered an imbalance in the usecount of the node being passed to of_fwnode_graph_get_port_parent(). Preserve the usecount by using of_get_parent() instead of of_get_next_parent() which don't decrement the usecount of the node passed to it. Fixes: 3b27d00e7b6d7c88 ("device property: Move fwnode graph ops to firmware specific locations") Signed-off-by: Niklas Söderlund Acked-by: Sakari Ailus Signed-off-by: Rob Herring --- drivers/of/property.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/property.c b/drivers/of/property.c index fbb72116e9d4..264c355ba1ff 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -954,7 +954,7 @@ of_fwnode_graph_get_port_parent(struct fwnode_handle *fwnode) struct device_node *np; /* Get the parent of the port */ - np = of_get_next_parent(to_of_node(fwnode)); + np = of_get_parent(to_of_node(fwnode)); if (!np) return NULL; From 6bd6ae2dfc7e091059fd8a650579bb1efc9b4b9f Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 24 Aug 2017 14:24:29 -0400 Subject: [PATCH 191/217] drm/msm: fix error path cleanup If we fail to attach iommu, gpu->aspace could be IS_ERR().. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index ffbff27600e0..6a887032c66a 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -718,7 +718,8 @@ void msm_gpu_cleanup(struct msm_gpu *gpu) msm_gem_put_iova(gpu->rb->bo, gpu->aspace); msm_ringbuffer_destroy(gpu->rb); } - if (gpu->aspace) { + + if (!IS_ERR_OR_NULL(gpu->aspace)) { gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu, NULL, 0); msm_gem_address_space_put(gpu->aspace); From 06451a3d1d777141dedfa947649cbb0c594ac3af Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 12 Sep 2017 14:23:05 -0400 Subject: [PATCH 192/217] drm/msm: fix _NO_IMPLICIT fencing case We need to call reservation_object_reserve_shared() in both cases, but this wasn't happening in the _NO_IMPLICIT submit case. Fixes: f0a42bb ("drm/msm: submit support for in-fences") Reported-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c | 11 ----------- drivers/gpu/drm/msm/msm_gem_submit.c | 24 ++++++++++++++++++------ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 0b338fbf97ce..ea5bb0e1632c 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -610,17 +610,6 @@ int msm_gem_sync_object(struct drm_gem_object *obj, struct dma_fence *fence; int i, ret; - if (!exclusive) { - /* NOTE: _reserve_shared() must happen before _add_shared_fence(), - * which makes this a slightly strange place to call it. OTOH this - * is a convenient can-fail point to hook it in. (And similar to - * how etnaviv and nouveau handle this.) - */ - ret = reservation_object_reserve_shared(msm_obj->resv); - if (ret) - return ret; - } - fobj = reservation_object_get_list(msm_obj->resv); if (!fobj || (fobj->shared_count == 0)) { fence = reservation_object_get_excl(msm_obj->resv); diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 5d0a75d4b249..93535cac0676 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -221,7 +221,7 @@ static int submit_lock_objects(struct msm_gem_submit *submit) return ret; } -static int submit_fence_sync(struct msm_gem_submit *submit) +static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) { int i, ret = 0; @@ -229,6 +229,20 @@ static int submit_fence_sync(struct msm_gem_submit *submit) struct msm_gem_object *msm_obj = submit->bos[i].obj; bool write = submit->bos[i].flags & MSM_SUBMIT_BO_WRITE; + if (!write) { + /* NOTE: _reserve_shared() must happen before + * _add_shared_fence(), which makes this a slightly + * strange place to call it. OTOH this is a + * convenient can-fail point to hook it in. + */ + ret = reservation_object_reserve_shared(msm_obj->resv); + if (ret) + return ret; + } + + if (no_implicit) + continue; + ret = msm_gem_sync_object(&msm_obj->base, submit->gpu->fctx, write); if (ret) break; @@ -451,11 +465,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, if (ret) goto out; - if (!(args->flags & MSM_SUBMIT_NO_IMPLICIT)) { - ret = submit_fence_sync(submit); - if (ret) - goto out; - } + ret = submit_fence_sync(submit, !!(args->flags & MSM_SUBMIT_NO_IMPLICIT)); + if (ret) + goto out; ret = submit_pin_objects(submit); if (ret) From 2aab9c3ca47dd4fcc19a8743c6e4d348640dd3fa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Oct 2017 14:22:04 +1100 Subject: [PATCH 193/217] scripts: fix faddr2line to work on last symbol If faddr2line is given a function name which is the last one listed by "nm -n", it will fail because it never finds the next symbol. So teach the awk script to catch that possibility, and use 'size' to provide the end point of the last function. Signed-off-by: NeilBrown Signed-off-by: Linus Torvalds --- scripts/faddr2line | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/faddr2line b/scripts/faddr2line index 29df825d375c..2f6ce802397d 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -103,11 +103,12 @@ __faddr2line() { # Go through each of the object's symbols which match the func name. # In rare cases there might be duplicates. + file_end=$(size -Ax $objfile | awk '$1 == ".text" {print $2}') while read symbol; do local fields=($symbol) local sym_base=0x${fields[0]} local sym_type=${fields[1]} - local sym_end=0x${fields[3]} + local sym_end=${fields[3]} # calculate the size local sym_size=$(($sym_end - $sym_base)) @@ -157,7 +158,7 @@ __faddr2line() { addr2line -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;" DONE=1 - done < <(nm -n $objfile | awk -v fn=$func '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, $1 }') + done < <(nm -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }') } [[ $# -lt 2 ]] && usage From 0d8ba16278ec30a262d931875018abee332f926f Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Fri, 13 Oct 2017 11:29:41 +0530 Subject: [PATCH 194/217] powerpc/perf: Fix IMC initialization crash Panic observed with latest firmware, and upstream kernel: NIP init_imc_pmu+0x8c/0xcf0 LR init_imc_pmu+0x2f8/0xcf0 Call Trace: init_imc_pmu+0x2c8/0xcf0 (unreliable) opal_imc_counters_probe+0x300/0x400 platform_drv_probe+0x64/0x110 driver_probe_device+0x3d8/0x580 __driver_attach+0x14c/0x1a0 bus_for_each_dev+0x8c/0xf0 driver_attach+0x34/0x50 bus_add_driver+0x298/0x350 driver_register+0x9c/0x180 __platform_driver_register+0x5c/0x70 opal_imc_driver_init+0x2c/0x40 do_one_initcall+0x64/0x1d0 kernel_init_freeable+0x280/0x374 kernel_init+0x24/0x160 ret_from_kernel_thread+0x5c/0x74 While registering nest imc at init, cpu-hotplug callback nest_pmu_cpumask_init() makes an OPAL call to stop the engine. And if the OPAL call fails, imc_common_cpuhp_mem_free() is invoked to cleanup memory and cpuhotplug setup. But when cleaning up the attribute group, we are dereferencing the attribute element array without checking whether the backing element is not NULL. This causes the kernel panic. Add a check for the backing element prior to dereferencing the attribute element, to handle the failing case gracefully. Signed-off-by: Anju T Sudhakar Reported-by: Pridhiviraj Paidipeddi [mpe: Trim change log] Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 39a0203fd8a5..88126245881b 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1176,7 +1176,8 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) } /* Only free the attr_groups which are dynamically allocated */ - kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); + if (pmu_ptr->attr_groups[IMC_EVENT_ATTR]) + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); kfree(pmu_ptr); return; From ce76353f169a6471542d999baf3d29b121dce9c0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 13 Oct 2017 14:32:37 +0200 Subject: [PATCH 195/217] iommu/amd: Finish TLB flush in amd_iommu_unmap() The function only sends the flush command to the IOMMU(s), but does not wait for its completion when it returns. Fix that. Fixes: 601367d76bd1 ('x86/amd-iommu: Remove iommu_flush_domain function') Cc: stable@vger.kernel.org # >= 2.6.33 Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 822679ac90a1..8e8874d23717 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3048,6 +3048,7 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, mutex_unlock(&domain->api_lock); domain_flush_tlb_pde(domain); + domain_flush_complete(domain); return unmap_size; } From e20d103b6c37038ca27409f746f0b3351bcd0c44 Mon Sep 17 00:00:00 2001 From: Mark Hairgrove Date: Fri, 13 Oct 2017 15:57:30 -0700 Subject: [PATCH 196/217] mm/migrate: fix indexing bug (off by one) and avoid out of bound access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Index was incremented before last use and thus the second array could dereference to an invalid address (not mentioning the fact that it did not properly clear the entry we intended to clear). Link: http://lkml.kernel.org/r/1506973525-16491-1-git-send-email-jglisse@redhat.com Fixes: 8315ada7f095bf ("mm/migrate: allow migrate_vma() to alloc new page on empty entry") Signed-off-by: Mark Hairgrove Signed-off-by: Jérôme Glisse Cc: Reza Arbab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 6954c1435833..e00814ca390e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2146,8 +2146,9 @@ static int migrate_vma_collect_hole(unsigned long start, unsigned long addr; for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE; + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; + migrate->npages++; migrate->cpages++; } From cc3fa84045694c2fd7ccf6ce84dee5cba372a7d3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 13 Oct 2017 15:57:33 -0700 Subject: [PATCH 197/217] lib/Kconfig.debug: kernel hacking menu: runtime testing: keep tests together Expand the "Runtime testing" menu by including more entries inside it instead of after it. This is just Kconfig symbol movement. This causes the (arch-independent) Runtime tests to be presented (listed) all in one place instead of in multiple places. Link: http://lkml.kernel.org/r/c194e5c4-2042-bf94-a2d8-7aa13756e257@infradead.org Signed-off-by: Randy Dunlap Acked-by: Kees Cook Cc: Dave Hansen Cc: "Luis R. Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 143 +++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 72 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2689b7c50c52..c1e720a22c71 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1590,6 +1590,54 @@ config LATENCYTOP source kernel/trace/Kconfig +config PROVIDE_OHCI1394_DMA_INIT + bool "Remote debugging over FireWire early on boot" + depends on PCI && X86 + help + If you want to debug problems which hang or crash the kernel early + on boot and the crashing machine has a FireWire port, you can use + this feature to remotely access the memory of the crashed machine + over FireWire. This employs remote DMA as part of the OHCI1394 + specification which is now the standard for FireWire controllers. + + With remote DMA, you can monitor the printk buffer remotely using + firescope and access all memory below 4GB using fireproxy from gdb. + Even controlling a kernel debugger is possible using remote DMA. + + Usage: + + If ohci1394_dma=early is used as boot parameter, it will initialize + all OHCI1394 controllers which are found in the PCI config space. + + As all changes to the FireWire bus such as enabling and disabling + devices cause a bus reset and thereby disable remote DMA for all + devices, be sure to have the cable plugged and FireWire enabled on + the debugging host before booting the debug target for debugging. + + This code (~1k) is freed after boot. By then, the firewire stack + in charge of the OHCI-1394 controllers should be used instead. + + See Documentation/debugging-via-ohci1394.txt for more information. + +config DMA_API_DEBUG + bool "Enable debugging of DMA-API usage" + depends on HAVE_DMA_API_DEBUG + help + Enable this option to debug the use of the DMA API by device drivers. + With this option you will be able to detect common bugs in device + drivers like double-freeing of DMA mappings or freeing mappings that + were never allocated. + + This also attempts to catch cases where a page owned by DMA is + accessed by the cpu in a way that could cause data corruption. For + example, this enables cow_user_page() to check that the source page is + not undergoing DMA. + + This option causes a performance degradation. Use only if you want to + debug device drivers and dma interactions. + + If unsure, say N. + menu "Runtime Testing" config LKDTM @@ -1749,56 +1797,6 @@ config TEST_PARMAN If unsure, say N. -endmenu # runtime tests - -config PROVIDE_OHCI1394_DMA_INIT - bool "Remote debugging over FireWire early on boot" - depends on PCI && X86 - help - If you want to debug problems which hang or crash the kernel early - on boot and the crashing machine has a FireWire port, you can use - this feature to remotely access the memory of the crashed machine - over FireWire. This employs remote DMA as part of the OHCI1394 - specification which is now the standard for FireWire controllers. - - With remote DMA, you can monitor the printk buffer remotely using - firescope and access all memory below 4GB using fireproxy from gdb. - Even controlling a kernel debugger is possible using remote DMA. - - Usage: - - If ohci1394_dma=early is used as boot parameter, it will initialize - all OHCI1394 controllers which are found in the PCI config space. - - As all changes to the FireWire bus such as enabling and disabling - devices cause a bus reset and thereby disable remote DMA for all - devices, be sure to have the cable plugged and FireWire enabled on - the debugging host before booting the debug target for debugging. - - This code (~1k) is freed after boot. By then, the firewire stack - in charge of the OHCI-1394 controllers should be used instead. - - See Documentation/debugging-via-ohci1394.txt for more information. - -config DMA_API_DEBUG - bool "Enable debugging of DMA-API usage" - depends on HAVE_DMA_API_DEBUG - help - Enable this option to debug the use of the DMA API by device drivers. - With this option you will be able to detect common bugs in device - drivers like double-freeing of DMA mappings or freeing mappings that - were never allocated. - - This also attempts to catch cases where a page owned by DMA is - accessed by the cpu in a way that could cause data corruption. For - example, this enables cow_user_page() to check that the source page is - not undergoing DMA. - - This option causes a performance degradation. Use only if you want to - debug device drivers and dma interactions. - - If unsure, say N. - config TEST_LKM tristate "Test module loading with 'hello world' module" default n @@ -1873,18 +1871,6 @@ config TEST_UDELAY If unsure, say N. -config MEMTEST - bool "Memtest" - depends on HAVE_MEMBLOCK - ---help--- - This option adds a kernel parameter 'memtest', which allows memtest - to be set. - memtest=0, mean disabled; -- default - memtest=1, mean do 1 test pattern; - ... - memtest=17, mean do 17 test patterns. - If you are unsure how to answer this question, answer N. - config TEST_STATIC_KEYS tristate "Test static keys" default n @@ -1894,16 +1880,6 @@ config TEST_STATIC_KEYS If unsure, say N. -config BUG_ON_DATA_CORRUPTION - bool "Trigger a BUG when data corruption is detected" - select DEBUG_LIST - help - Select this option if the kernel should BUG when it encounters - data corruption in kernel memory structures when they get checked - for validity. - - If unsure, say N. - config TEST_KMOD tristate "kmod stress tester" default n @@ -1941,6 +1917,29 @@ config TEST_DEBUG_VIRTUAL If unsure, say N. +endmenu # runtime tests + +config MEMTEST + bool "Memtest" + depends on HAVE_MEMBLOCK + ---help--- + This option adds a kernel parameter 'memtest', which allows memtest + to be set. + memtest=0, mean disabled; -- default + memtest=1, mean do 1 test pattern; + ... + memtest=17, mean do 17 test patterns. + If you are unsure how to answer this question, answer N. + +config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked + for validity. + + If unsure, say N. source "samples/Kconfig" From c02c30093254189a6ef55fed415a4ffb55a74cdf Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 13 Oct 2017 15:57:37 -0700 Subject: [PATCH 198/217] mm/madvise.c: add description for MADV_WIPEONFORK and MADV_KEEPONFORK mm/madvise.c has a brief description about all MADV_ flags. Add a description for the newly added MADV_WIPEONFORK and MADV_KEEPONFORK. Although man page has the similar information, but it'd better to keep the consistent with other flags. Link: http://lkml.kernel.org/r/1506117328-88228-1-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 25bade36e9ca..fd70d6aabc3e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -757,6 +757,9 @@ madvise_behavior_valid(int behavior) * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_WIPEONFORK - present the child process with zero-filled memory in this + * range after a fork. + * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. @@ -777,7 +780,9 @@ madvise_behavior_valid(int behavior) * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application - * is attempting to release locked or shared pages. + * is attempting to release locked or shared pages, + * or the specified address range includes file, Huge TLB, + * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. From 8a1ac5dc7be09883051b1bf89a5e57d7ad850fa5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 15:57:40 -0700 Subject: [PATCH 199/217] include/linux/of.h: provide of_n_{addr,size}_cells wrappers for !CONFIG_OF The pci-rcar driver is enabled for compile tests, and this has shown that the driver cannot build without CONFIG_OF, following the inclusion of commit f8f2fe7355fb ("PCI: rcar: Use new OF interrupt mapping when possible"): drivers/pci/host/pcie-rcar.c: In function 'pci_dma_range_parser_init': drivers/pci/host/pcie-rcar.c:1039:2: error: implicit declaration of function 'of_n_addr_cells' [-Werror=implicit-function-declaration] parser->pna = of_n_addr_cells(node); ^ As pointed out by Ben Dooks and Geert Uytterhoeven, this is actually supposed to build fine, which we can achieve if we make the declaration of of_irq_parse_and_map_pci conditional on CONFIG_OF and provide an empty inline function otherwise, as we do for a lot of other of interfaces. This lets us build the rcar_pci driver again without CONFIG_OF for build testing. All platforms using this driver select OF, so this doesn't change anything for the users. [akpm@linux-foundation.org: be consistent with surrounding code] Link: http://lkml.kernel.org/r/20170911200805.3363318-1-arnd@arndb.de Fixes: c25da4778803 ("PCI: rcar: Add Renesas R-Car PCIe driver") Signed-off-by: Arnd Bergmann Reviewed-by: Frank Rowand Acked-by: Geert Uytterhoeven Cc: Bjorn Helgaas Cc: Magnus Damm Cc: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/of.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/of.h b/include/linux/of.h index cfc34117fc92..b240ed69dc96 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -734,6 +734,16 @@ static inline struct device_node *of_get_cpu_node(int cpu, return NULL; } +static inline int of_n_addr_cells(struct device_node *np) +{ + return 0; + +} +static inline int of_n_size_cells(struct device_node *np) +{ + return 0; +} + static inline int of_property_read_u64(const struct device_node *np, const char *propname, u64 *out_value) { From de55c8b251974247edda38e952da8e8dd71683ec Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Oct 2017 15:57:43 -0700 Subject: [PATCH 200/217] mm/mempolicy: fix NUMA_INTERLEAVE_HIT counter Commit 3a321d2a3dde ("mm: change the call sites of numa statistics items") separated NUMA counters from zone counters, but the NUMA_INTERLEAVE_HIT call site wasn't updated to use the new interface. So alloc_page_interleave() actually increments NR_ZONE_INACTIVE_FILE instead of NUMA_INTERLEAVE_HIT. Fix this by using __inc_numa_state() interface to increment NUMA_INTERLEAVE_HIT. Link: http://lkml.kernel.org/r/20171003191003.8573-1-aryabinin@virtuozzo.com Fixes: 3a321d2a3dde ("mm: change the call sites of numa statistics items") Signed-off-by: Andrey Ryabinin Acked-by: Mel Gorman Cc: Kemi Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 006ba625c0b8..a2af6d58a68f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1920,8 +1920,11 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, struct page *page; page = __alloc_pages(gfp, order, nid); - if (page && page_to_nid(page) == nid) - inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); + if (page && page_to_nid(page) == nid) { + preempt_disable(); + __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); + } return page; } From af0db981f35ea99b00a0b249bf0bedef8cf972e8 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 13 Oct 2017 15:57:47 -0700 Subject: [PATCH 201/217] mm: remove unnecessary WARN_ONCE in page_vma_mapped_walk(). A non present pmd entry can appear after pmd_lock is taken in page_vma_mapped_walk(), even if THP migration is not enabled. The WARN_ONCE is unnecessary. Link: http://lkml.kernel.org/r/20171003142606.12324-1-zi.yan@sent.com Fixes: 616b8371539a ("mm: thp: enable thp migration in generic path") Signed-off-by: Zi Yan Reported-by: Abdul Haleem Tested-by: Abdul Haleem Acked-by: Kirill A. Shutemov Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_vma_mapped.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 6a03946469a9..eb462e7db0a9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -167,8 +167,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return not_found(pvmw); return true; } - } else - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } return not_found(pvmw); } else { /* THP pmd was split under us: handle on pte level */ From 064f0e9302af4f4ab5e9dca03a5a77d6bebfd35e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 13 Oct 2017 15:57:50 -0700 Subject: [PATCH 202/217] mm: only display online cpus of the numa node When I execute numactl -H (which reads /sys/devices/system/node/nodeX/cpumap and displays cpumask_of_node for each node), I get different result on X86 and arm64. For each numa node, the former only displayed online CPUs, and the latter displayed all possible CPUs. Unfortunately, both Linux documentation and numactl manual have not described it clear. I sent a mail to ask for help, and Michal Hocko replied that he preferred to print online cpus because it doesn't really make much sense to bind anything on offline nodes. Will said: "I suspect the vast majority (if not all) code that reads this file was developed for x86, so having the same behaviour for arm64 sounds like something we should do ASAP before people try to special case with things like #ifdef __aarch64__. I'd rather have this in 4.14 if possible." Link: http://lkml.kernel.org/r/1506678805-15392-2-git-send-email-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Acked-by: Michal Hocko Cc: Catalin Marinas Cc: Will Deacon Cc: Greg Kroah-Hartman Cc: Tianhong Ding Cc: Hanjun Guo Cc: Libin Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 3855902f2c5b..aae2402f3791 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -27,13 +27,21 @@ static struct bus_type node_subsys = { static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf) { + ssize_t n; + cpumask_var_t mask; struct node *node_dev = to_node(dev); - const struct cpumask *mask = cpumask_of_node(node_dev->dev.id); /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */ BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1)); - return cpumap_print_to_pagebuf(list, buf, mask); + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return 0; + + cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask); + n = cpumap_print_to_pagebuf(list, buf, mask); + free_cpumask_var(mask); + + return n; } static inline ssize_t node_read_cpumask(struct device *dev, From 7ddd8faf4399ab4f4edad5604eab35f8a87caf02 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 13 Oct 2017 15:57:54 -0700 Subject: [PATCH 203/217] userfaultfd: selftest: exercise -EEXIST only in background transfer I was stress testing some backports and with high load, after some time, the latest version of the selftest showed some false positive in connection with the uffdio_copy_retry. This seems to fix it while still exercising -EEXIST in the background transfer once in a while. The fork child will quit after the last UFFDIO_COPY is run, so a repeated UFFDIO_COPY may not return -EEXIST. This change restricts the -EEXIST stress to the background transfer where the memory can't go away from under it. Also updated uffdio_zeropage, so the interface is consistent. Link: http://lkml.kernel.org/r/20171004171541.1495-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Pavel Emelyanov Cc: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 25 +++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index a2c53a3d223d..de2f9ec8a87f 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -397,7 +397,7 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, } } -static int copy_page(int ufd, unsigned long offset) +static int __copy_page(int ufd, unsigned long offset, bool retry) { struct uffdio_copy uffdio_copy; @@ -418,7 +418,7 @@ static int copy_page(int ufd, unsigned long offset) fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", uffdio_copy.copy), exit(1); } else { - if (test_uffdio_copy_eexist) { + if (test_uffdio_copy_eexist && retry) { test_uffdio_copy_eexist = false; retry_copy_page(ufd, &uffdio_copy, offset); } @@ -427,6 +427,16 @@ static int copy_page(int ufd, unsigned long offset) return 0; } +static int copy_page_retry(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, true); +} + +static int copy_page(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, false); +} + static void *uffd_poll_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -544,7 +554,7 @@ static void *background_thread(void *arg) for (page_nr = cpu * nr_pages_per_cpu; page_nr < (cpu+1) * nr_pages_per_cpu; page_nr++) - copy_page(uffd, page_nr * page_size); + copy_page_retry(uffd, page_nr * page_size); return NULL; } @@ -779,7 +789,7 @@ static void retry_uffdio_zeropage(int ufd, } } -static int uffdio_zeropage(int ufd, unsigned long offset) +static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) { struct uffdio_zeropage uffdio_zeropage; int ret; @@ -814,7 +824,7 @@ static int uffdio_zeropage(int ufd, unsigned long offset) fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", uffdio_zeropage.zeropage), exit(1); } else { - if (test_uffdio_zeropage_eexist) { + if (test_uffdio_zeropage_eexist && retry) { test_uffdio_zeropage_eexist = false; retry_uffdio_zeropage(ufd, &uffdio_zeropage, offset); @@ -830,6 +840,11 @@ static int uffdio_zeropage(int ufd, unsigned long offset) return 0; } +static int uffdio_zeropage(int ufd, unsigned long offset) +{ + return __uffdio_zeropage(ufd, offset, false); +} + /* exercise UFFDIO_ZEROPAGE */ static int userfaultfd_zeropage_test(void) { From 51962a9d437f0d580c04cd2c4abc2bd417200da2 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 13 Oct 2017 15:57:58 -0700 Subject: [PATCH 204/217] scripts/kallsyms.c: ignore symbol type 'n' gcc on aarch64 may emit synbols of type 'n' if the kernel is built with '-frecord-gcc-switches'. In most cases, those symbols are reported with nm as 000000000000000e n $d and with objdump as 0000000000000000 l d .GCC.command.line 0000000000000000 .GCC.command.line 000000000000000e l .GCC.command.line 0000000000000000 $d Those symbols are detected in is_arm_mapping_symbol() and ignored. However, if "--prefix-symbols=" is configured as well, the situation is different. For example, in efi/libstub, arm64 images are built with '--prefix-alloc-sections=.init --prefix-symbols=__efistub_'. In combination with '-frecord-gcc-switches', the symbols are now reported by nm as: 000000000000000e n __efistub_$d and by objdump as: 0000000000000000 l d .GCC.command.line 0000000000000000 .GCC.command.line 000000000000000e l .GCC.command.line 0000000000000000 __efistub_$d Those symbols are no longer ignored and included in the base address calculation. This results in a base address of 000000000000000e, which in turn causes kallsyms to abort with kallsyms failure: relative symbol value 0xffffff900800a000 out of range in relative mode The problem is seen in little endian arm64 builds with CONFIG_EFI enabled and with '-frecord-gcc-switches' set in KCFLAGS. Explicitly ignore symbols of type 'n' since those are clearly debug symbols. Link: http://lkml.kernel.org/r/1507136063-3139-1-git-send-email-linux@roeck-us.net Signed-off-by: Guenter Roeck Acked-by: Ard Biesheuvel Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kallsyms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 5d554419170b..9ee9bf7fd1a2 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -158,7 +158,7 @@ static int read_symbol(FILE *in, struct sym_entry *s) else if (str[0] == '$') return -1; /* exclude debugging symbols */ - else if (stype == 'N') + else if (stype == 'N' || stype == 'n') return -1; /* include the type field in the symbol name, so that it gets From ef4650144e76ae361fe4b8c9a0afcd53074cd520 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 13 Oct 2017 15:58:01 -0700 Subject: [PATCH 205/217] mm/cma.c: take __GFP_NOWARN into account in cma_alloc() cma_alloc() unconditionally prints an INFO message when the CMA allocation fails. Make this message conditional on the non-presence of __GFP_NOWARN in gfp_mask. This patch aims at removing INFO messages that are displayed when the VC4 driver tries to allocate buffer objects. From the driver perspective an allocation failure is acceptable, and the driver can possibly do something to make following allocation succeed (like flushing the VC4 internal cache). Link: http://lkml.kernel.org/r/20171004125447.15195-1-boris.brezillon@free-electrons.com Signed-off-by: Boris Brezillon Acked-by: Laura Abbott Cc: Jaewon Kim Cc: David Airlie Cc: Daniel Vetter Cc: Eric Anholt Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index c0da318c020e..022e52bd8370 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -460,7 +460,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); - if (ret) { + if (ret && !(gfp_mask & __GFP_NOWARN)) { pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); From b8c8a338f75e052d9fa2fed851259320af412e3f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 13 Oct 2017 15:58:05 -0700 Subject: [PATCH 206/217] Revert "vmalloc: back off when the current task is killed" This reverts commits 5d17a73a2ebe ("vmalloc: back off when the current task is killed") and 171012f56127 ("mm: don't warn when vmalloc() fails due to a fatal signal"). Commit 5d17a73a2ebe ("vmalloc: back off when the current task is killed") made all vmalloc allocations from a signal-killed task fail. We have seen crashes in the tty driver from this, where a killed task exiting tries to switch back to N_TTY, fails n_tty_open because of the vmalloc failing, and later crashes when dereferencing tty->disc_data. Arguably, relying on a vmalloc() call to succeed in order to properly exit a task is not the most robust way of doing things. There will be a follow-up patch to the tty code to fall back to the N_NULL ldisc. But the justification to make that vmalloc() call fail like this isn't convincing, either. The patch mentions an OOM victim exhausting the memory reserves and thus deadlocking the machine. But the OOM killer is only one, improbable source of fatal signals. It doesn't make sense to fail allocations preemptively with plenty of memory in most cases. The patch doesn't mention real-life instances where vmalloc sites would exhaust memory, which makes it sound more like a theoretical issue to begin with. But just in case, the OOM access to memory reserves has been restricted on the allocator side in cd04ae1e2dc8 ("mm, oom: do not rely on TIF_MEMDIE for memory reserves access"), which should take care of any theoretical concerns on that front. Revert this patch, and the follow-up that suppresses the allocation warnings when we fail the allocations due to a signal. Link: http://lkml.kernel.org/r/20171004185906.GB2136@cmpxchg.org Fixes: 171012f56127 ("mm: don't warn when vmalloc() fails due to a fatal signal") Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Alan Cox Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8a43db6284eb..673942094328 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1695,11 +1695,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; - if (fatal_signal_pending(current)) { - area->nr_pages = i; - goto fail_no_warn; - } - if (node == NUMA_NO_NODE) page = alloc_page(alloc_mask|highmem_mask); else @@ -1723,7 +1718,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); -fail_no_warn: vfree(area->addr); return NULL; } From e65c62b1375cbff69fa925787bcdae4b27bffb48 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 13 Oct 2017 15:58:08 -0700 Subject: [PATCH 207/217] tty: fall back to N_NULL if switching to N_TTY fails during hangup We have seen NULL-pointer dereference crashes in tty->disc_data when the N_TTY fallback driver failed to open during hangup. The immediate cause of this open to fail has been addressed in the preceding patch to vmalloc(), but this code could be more robust. As Alan pointed out in commit 8a8dabf2dd68 ("tty: handle the case where we cannot restore a line discipline"), the N_TTY driver, historically the safe fallback that could never fail, can indeed fail, but the surrounding code is not prepared to handle this. To avoid crashes he added a new N_NULL driver to take N_TTY's place as the last resort. Hook that fallback up to the hangup path. Update tty_ldisc_reinit() to reflect the reality that n_tty_open can indeed fail. Link: http://lkml.kernel.org/r/20171004185959.GC2136@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Alan Cox Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/tty_ldisc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 2fe216b276e2..84a8ac2a779f 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -694,10 +694,8 @@ int tty_ldisc_reinit(struct tty_struct *tty, int disc) tty_set_termios_ldisc(tty, disc); retval = tty_ldisc_open(tty, tty->ldisc); if (retval) { - if (!WARN_ON(disc == N_TTY)) { - tty_ldisc_put(tty->ldisc); - tty->ldisc = NULL; - } + tty_ldisc_put(tty->ldisc); + tty->ldisc = NULL; } return retval; } @@ -752,8 +750,9 @@ void tty_ldisc_hangup(struct tty_struct *tty, bool reinit) if (tty->ldisc) { if (reinit) { - if (tty_ldisc_reinit(tty, tty->termios.c_line) < 0) - tty_ldisc_reinit(tty, N_TTY); + if (tty_ldisc_reinit(tty, tty->termios.c_line) < 0 && + tty_ldisc_reinit(tty, N_TTY) < 0) + WARN_ON(tty_ldisc_reinit(tty, N_NULL) < 0); } else tty_ldisc_kill(tty); } From e8c97af0c1f23d6ffedcaa3918861f2595e1db62 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 13 Oct 2017 15:58:11 -0700 Subject: [PATCH 208/217] linux/kernel.h: add/correct kernel-doc notation Add kernel-doc notation for some macros. Correct kernel-doc comments & typos for a few macros. Link: http://lkml.kernel.org/r/76fa1403-1511-be4c-e9c4-456b43edfad3@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 90 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 0ad4c3044cf9..91189bb0c818 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -44,6 +44,12 @@ #define STACK_MAGIC 0xdeadbeef +/** + * REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value + * @x: value to repeat + * + * NOTE: @x is not checked for > 0xff; larger values produce odd results. + */ #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) /* @a is a power of 2 value */ @@ -57,6 +63,10 @@ #define READ 0 #define WRITE 1 +/** + * ARRAY_SIZE - get the number of elements in array @arr + * @arr: array to be sized + */ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) #define u64_to_user_ptr(x) ( \ @@ -76,7 +86,15 @@ #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) #define round_down(x, y) ((x) & ~__round_mask(x, y)) +/** + * FIELD_SIZEOF - get the size of a struct's field + * @t: the target struct + * @f: the target struct's field + * Return: the size of @f in the struct definition without having a + * declared instance of @t. + */ #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) + #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP #define DIV_ROUND_DOWN_ULL(ll, d) \ @@ -107,7 +125,7 @@ /* * Divide positive or negative dividend by positive or negative divisor * and round to closest integer. Result is undefined for negative - * divisors if he dividend variable type is unsigned and for negative + * divisors if the dividend variable type is unsigned and for negative * dividends if the divisor variable type is unsigned. */ #define DIV_ROUND_CLOSEST(x, divisor)( \ @@ -247,13 +265,13 @@ extern int _cond_resched(void); * @ep_ro: right open interval endpoint * * Perform a "reciprocal multiplication" in order to "scale" a value into - * range [0, ep_ro), where the upper interval endpoint is right-open. + * range [0, @ep_ro), where the upper interval endpoint is right-open. * This is useful, e.g. for accessing a index of an array containing - * ep_ro elements, for example. Think of it as sort of modulus, only that + * @ep_ro elements, for example. Think of it as sort of modulus, only that * the result isn't that of modulo. ;) Note that if initial input is a * small value, then result will return 0. * - * Return: a result based on val in interval [0, ep_ro). + * Return: a result based on @val in interval [0, @ep_ro). */ static inline u32 reciprocal_scale(u32 val, u32 ep_ro) { @@ -618,8 +636,8 @@ do { \ * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing * - * Note: __trace_printk is an internal function for trace_printk and - * the @ip is passed in via the trace_printk macro. + * Note: __trace_printk is an internal function for trace_printk() and + * the @ip is passed in via the trace_printk() macro. * * This function allows a kernel developer to debug fast path sections * that printk is not appropriate for. By scattering in various @@ -629,7 +647,7 @@ do { \ * This is intended as a debugging tool for the developer only. * Please refrain from leaving trace_printks scattered around in * your code. (Extra memory is used for special buffers that are - * allocated when trace_printk() is used) + * allocated when trace_printk() is used.) * * A little optization trick is done here. If there's only one * argument, there's no need to scan the string for printf formats. @@ -681,7 +699,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...); * the @ip is passed in via the trace_puts macro. * * This is similar to trace_printk() but is made for those really fast - * paths that a developer wants the least amount of "Heisenbug" affects, + * paths that a developer wants the least amount of "Heisenbug" effects, * where the processing of the print format is still too much. * * This function allows a kernel developer to debug fast path sections @@ -692,7 +710,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...); * This is intended as a debugging tool for the developer only. * Please refrain from leaving trace_puts scattered around in * your code. (Extra memory is used for special buffers that are - * allocated when trace_puts() is used) + * allocated when trace_puts() is used.) * * Returns: 0 if nothing was written, positive # if string was. * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) @@ -771,6 +789,12 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } t2 min2 = (y); \ (void) (&min1 == &min2); \ min1 < min2 ? min1 : min2; }) + +/** + * min - return minimum of two values of the same or compatible types + * @x: first value + * @y: second value + */ #define min(x, y) \ __min(typeof(x), typeof(y), \ __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ @@ -781,12 +805,31 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } t2 max2 = (y); \ (void) (&max1 == &max2); \ max1 > max2 ? max1 : max2; }) + +/** + * max - return maximum of two values of the same or compatible types + * @x: first value + * @y: second value + */ #define max(x, y) \ __max(typeof(x), typeof(y), \ __UNIQUE_ID(max1_), __UNIQUE_ID(max2_), \ x, y) +/** + * min3 - return minimum of three values + * @x: first value + * @y: second value + * @z: third value + */ #define min3(x, y, z) min((typeof(x))min(x, y), z) + +/** + * max3 - return maximum of three values + * @x: first value + * @y: second value + * @z: third value + */ #define max3(x, y, z) max((typeof(x))max(x, y), z) /** @@ -805,8 +848,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } * @lo: lowest allowable value * @hi: highest allowable value * - * This macro does strict typechecking of lo/hi to make sure they are of the - * same type as val. See the unnecessary pointer comparisons. + * This macro does strict typechecking of @lo/@hi to make sure they are of the + * same type as @val. See the unnecessary pointer comparisons. */ #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) @@ -816,11 +859,24 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } * * Or not use min/max/clamp at all, of course. */ + +/** + * min_t - return minimum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ #define min_t(type, x, y) \ __min(type, type, \ __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ x, y) +/** + * max_t - return maximum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ #define max_t(type, x, y) \ __max(type, type, \ __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ @@ -834,7 +890,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } * @hi: maximum allowable value * * This macro does no typechecking and uses temporary variables of type - * 'type' to make all the comparisons. + * @type to make all the comparisons. */ #define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) @@ -845,15 +901,17 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } * @hi: maximum allowable value * * This macro does no typechecking and uses temporary variables of whatever - * type the input argument 'val' is. This is useful when val is an unsigned - * type and min and max are literals that will otherwise be assigned a signed + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed * integer type. */ #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) -/* - * swap - swap value of @a and @b +/** + * swap - swap values of @a and @b + * @a: first value + * @b: second value */ #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) From f892760aa66a2d657deaf59538fb69433036767c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 13 Oct 2017 15:58:15 -0700 Subject: [PATCH 209/217] fs/mpage.c: fix mpage_writepage() for pages with buffers When using FAT on a block device which supports rw_page, we can hit BUG_ON(!PageLocked(page)) in try_to_free_buffers(). This is because we call clean_buffers() after unlocking the page we've written. Introduce a new clean_page_buffers() which cleans all buffers associated with a page and call it from within bdev_write_page(). [akpm@linux-foundation.org: s/PAGE_SIZE/~0U/ per Linus and Matthew] Link: http://lkml.kernel.org/r/20171006211541.GA7409@bombadil.infradead.org Signed-off-by: Matthew Wilcox Reported-by: Toshi Kani Reported-by: OGAWA Hirofumi Tested-by: Toshi Kani Acked-by: Johannes Thumshirn Cc: Ross Zwisler Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 6 ++++-- fs/mpage.c | 14 +++++++++++--- include/linux/buffer_head.h | 1 + 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 93d088ffc05c..789f55e851ae 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -716,10 +716,12 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true); - if (result) + if (result) { end_page_writeback(page); - else + } else { + clean_page_buffers(page); unlock_page(page); + } blk_queue_exit(bdev->bd_queue); return result; } diff --git a/fs/mpage.c b/fs/mpage.c index 37bb77c1302c..c991faec70b9 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -468,6 +468,16 @@ static void clean_buffers(struct page *page, unsigned first_unmapped) try_to_free_buffers(page); } +/* + * For situations where we want to clean all buffers attached to a page. + * We don't need to calculate how many buffers are attached to the page, + * we just need to specify a number larger than the maximum number of buffers. + */ +void clean_page_buffers(struct page *page) +{ + clean_buffers(page, ~0U); +} + static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -605,10 +615,8 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (bio == NULL) { if (first_unmapped == blocks_per_page) { if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) { - clean_buffers(page, first_unmapped); + page, wbc)) goto out; - } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae555eccf..446b24cac67d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -232,6 +232,7 @@ int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); +void clean_page_buffers(struct page *page); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t *, loff_t *); From 7e86600606cef21beec725039d70377fb364f881 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Fri, 13 Oct 2017 15:58:18 -0700 Subject: [PATCH 210/217] fs/binfmt_misc.c: node could be NULL when evicting inode inode->i_private is assigned by a Node pointer only after registering a new binary format, so it could be NULL if inode was created by bm_fill_super() (or iput() was called by the error path in bm_register_write()), and this could result in NULL pointer dereference when evicting such an inode. e.g. mount binfmt_misc filesystem then umount it immediately: mount -t binfmt_misc binfmt_misc /proc/sys/fs/binfmt_misc umount /proc/sys/fs/binfmt_misc will result in BUG: unable to handle kernel NULL pointer dereference at 0000000000000013 IP: bm_evict_inode+0x16/0x40 [binfmt_misc] ... Call Trace: evict+0xd3/0x1a0 iput+0x17d/0x1d0 dentry_unlink_inode+0xb9/0xf0 __dentry_kill+0xc7/0x170 shrink_dentry_list+0x122/0x280 shrink_dcache_parent+0x39/0x90 do_one_tree+0x12/0x40 shrink_dcache_for_umount+0x2d/0x90 generic_shutdown_super+0x1f/0x120 kill_litter_super+0x29/0x40 deactivate_locked_super+0x43/0x70 deactivate_super+0x45/0x60 cleanup_mnt+0x3f/0x70 __cleanup_mnt+0x12/0x20 task_work_run+0x86/0xa0 exit_to_usermode_loop+0x6d/0x99 syscall_return_slowpath+0xba/0xf0 entry_SYSCALL_64_fastpath+0xa3/0xa Fix it by making sure Node (e) is not NULL. Link: http://lkml.kernel.org/r/20171010100642.31786-1-eguan@redhat.com Fixes: 83f918274e4b ("exec: binfmt_misc: shift filp_close(interp_file) from kill_node() to bm_evict_inode()") Signed-off-by: Eryu Guan Acked-by: Oleg Nesterov Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 2a46762def31..a7c5a9861bef 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -596,7 +596,7 @@ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; - if (e->flags & MISC_FMT_OPEN_FILE) + if (e && e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); clear_inode(inode); From ca182551857cc2c1e6a2b7f1e72090a137a15008 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 13 Oct 2017 15:58:22 -0700 Subject: [PATCH 211/217] kmemleak: clear stale pointers from task stacks Kmemleak considers any pointers on task stacks as references. This patch clears newly allocated and reused vmap stacks. Link: http://lkml.kernel.org/r/150728990124.744199.8403409836394318684.stgit@buzz Signed-off-by: Konstantin Khlebnikov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 +- kernel/fork.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 905d769d8ddc..5f7eeab990fe 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -42,7 +42,7 @@ enum { #define THREAD_ALIGN THREAD_SIZE #endif -#ifdef CONFIG_DEBUG_STACK_USAGE +#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) # define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ __GFP_ZERO) #else diff --git a/kernel/fork.c b/kernel/fork.c index e702cb9ffbd8..07cc743698d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; +#ifdef CONFIG_DEBUG_KMEMLEAK + /* Clear stale pointers from reused stack. */ + memset(s->addr, 0, THREAD_SIZE); +#endif tsk->stack_vm_area = s; return s->addr; } From a7b100953aa33a5bbdc3e5e7f2241b9c0704606e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Oct 2017 15:58:25 -0700 Subject: [PATCH 212/217] mm: page_vma_mapped: ensure pmd is loaded with READ_ONCE outside of lock Loading the pmd without holding the pmd_lock exposes us to races with concurrent updaters of the page tables but, worse still, it also allows the compiler to cache the pmd value in a register and reuse it later on, even if we've performed a READ_ONCE in between and seen a more recent value. In the case of page_vma_mapped_walk, this leads to the following crash when the pmd loaded for the initial pmd_trans_huge check is all zeroes and a subsequent valid table entry is loaded by check_pmd. We then proceed into map_pte, but the compiler re-uses the zero entry inside pte_offset_map, resulting in a junk pointer being installed in pvmw->pte: PC is at check_pte+0x20/0x170 LR is at page_vma_mapped_walk+0x2e0/0x540 [...] Process doio (pid: 2463, stack limit = 0xffff00000f2e8000) Call trace: check_pte+0x20/0x170 page_vma_mapped_walk+0x2e0/0x540 page_mkclean_one+0xac/0x278 rmap_walk_file+0xf0/0x238 rmap_walk+0x64/0xa0 page_mkclean+0x90/0xa8 clear_page_dirty_for_io+0x84/0x2a8 mpage_submit_page+0x34/0x98 mpage_process_page_bufs+0x164/0x170 mpage_prepare_extent_to_map+0x134/0x2b8 ext4_writepages+0x484/0xe30 do_writepages+0x44/0xe8 __filemap_fdatawrite_range+0xbc/0x110 file_write_and_wait_range+0x48/0xd8 ext4_sync_file+0x80/0x4b8 vfs_fsync_range+0x64/0xc0 SyS_msync+0x194/0x1e8 This patch fixes the problem by ensuring that READ_ONCE is used before the initial checks on the pmd, and this value is subsequently used when checking whether or not the pmd is present. pmd_check is removed and the pmd_present check is inlined directly. Link: http://lkml.kernel.org/r/1507222630-5839-1-git-send-email-will.deacon@arm.com Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()") Signed-off-by: Will Deacon Tested-by: Yury Norov Tested-by: Richard Ruigrok Acked-by: Kirill A. Shutemov Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_vma_mapped.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index eb462e7db0a9..53afbb919a1c 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -6,17 +6,6 @@ #include "internal.h" -static inline bool check_pmd(struct page_vma_mapped_walk *pvmw) -{ - pmd_t pmde; - /* - * Make sure we don't re-load pmd between present and !trans_huge check. - * We need a consistent view. - */ - pmde = READ_ONCE(*pvmw->pmd); - return pmd_present(pmde) && !pmd_trans_huge(pmde); -} - static inline bool not_found(struct page_vma_mapped_walk *pvmw) { page_vma_mapped_walk_done(pvmw); @@ -116,6 +105,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) pgd_t *pgd; p4d_t *p4d; pud_t *pud; + pmd_t pmde; /* The only possible pmd mapping has been handled on last iteration */ if (pvmw->pmd && !pvmw->pte) @@ -148,7 +138,13 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (!pud_present(*pud)) return false; pvmw->pmd = pmd_offset(pud, pvmw->address); - if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) { + /* + * Make sure the pmd value isn't cached in a register by the + * compiler and used as a stale value after we've observed a + * subsequent update. + */ + pmde = READ_ONCE(*pvmw->pmd); + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); if (likely(pmd_trans_huge(*pvmw->pmd))) { if (pvmw->flags & PVMW_MIGRATION) @@ -174,9 +170,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) spin_unlock(pvmw->ptl); pvmw->ptl = NULL; } - } else { - if (!check_pmd(pvmw)) - return false; + } else if (!pmd_present(pmde)) { + return false; } if (!map_pte(pvmw)) goto next_pte; From 61b639723be5a9fc4812d5d85cb769589afa5a38 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 13 Oct 2017 15:58:29 -0700 Subject: [PATCH 213/217] mm, swap: use page-cluster as max window of VMA based swap readahead When the VMA based swap readahead was introduced, a new knob /sys/kernel/mm/swap/vma_ra_max_order was added as the max window of VMA swap readahead. This is to make it possible to use different max window for VMA based readahead and original physical readahead. But Minchan Kim pointed out that this will cause a regression because setting page-cluster sysctl to zero cannot disable swap readahead with the change. To fix the regression, the page-cluster sysctl is used as the max window of both the VMA based swap readahead and original physical swap readahead. If more fine grained control is needed in the future, more knobs can be added as the subordinate knobs of the page-cluster sysctl. The vma_ra_max_order knob is deleted. Because the knob was introduced in v4.14-rc1, and this patch is targeting being merged before v4.14 releasing, there should be no existing users of this newly added ABI. Link: http://lkml.kernel.org/r/20171011070847.16003-1-ying.huang@intel.com Fixes: ec560175c0b6fce ("mm, swap: VMA based swap readahead") Signed-off-by: "Huang, Ying" Reported-by: Minchan Kim Acked-by: Minchan Kim Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-kernel-mm-swap | 10 ----- mm/swap_state.c | 41 ++++--------------- 2 files changed, 7 insertions(+), 44 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-swap b/Documentation/ABI/testing/sysfs-kernel-mm-swap index 587db52084c7..94672016c268 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-swap +++ b/Documentation/ABI/testing/sysfs-kernel-mm-swap @@ -14,13 +14,3 @@ Description: Enable/disable VMA based swap readahead. still used for tmpfs etc. other users. If set to false, the global swap readahead algorithm will be used for all swappable pages. - -What: /sys/kernel/mm/swap/vma_ra_max_order -Date: August 2017 -Contact: Linux memory management mailing list -Description: The max readahead size in order for VMA based swap readahead - - VMA based swap readahead algorithm will readahead at - most 1 << max_order pages for each readahead. The - real readahead size for each readahead will be scaled - according to the estimation algorithm. diff --git a/mm/swap_state.c b/mm/swap_state.c index ed91091d1e68..05b6803f0cce 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,10 +39,6 @@ struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; bool swap_vma_readahead = true; -#define SWAP_RA_MAX_ORDER_DEFAULT 3 - -static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT; - #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK @@ -664,6 +660,13 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, pte_t *tpte; #endif + max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), + SWAP_RA_ORDER_CEILING); + if (max_win == 1) { + swap_ra->win = 1; + return NULL; + } + faddr = vmf->address; entry = pte_to_swp_entry(vmf->orig_pte); if ((unlikely(non_swap_entry(entry)))) @@ -672,12 +675,6 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, if (page) return page; - max_win = 1 << READ_ONCE(swap_ra_max_order); - if (max_win == 1) { - swap_ra->win = 1; - return NULL; - } - fpfn = PFN_DOWN(faddr); swap_ra_info = GET_SWAP_RA_VAL(vma); pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); @@ -786,32 +783,8 @@ static struct kobj_attribute vma_ra_enabled_attr = __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, vma_ra_enabled_store); -static ssize_t vma_ra_max_order_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", swap_ra_max_order); -} -static ssize_t vma_ra_max_order_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err, v; - - err = kstrtoint(buf, 10, &v); - if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) - return -EINVAL; - - swap_ra_max_order = v; - - return count; -} -static struct kobj_attribute vma_ra_max_order_attr = - __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, - vma_ra_max_order_store); - static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, - &vma_ra_max_order_attr.attr, NULL, }; From b956575bed91ecfb136a8300742ecbbf451471ab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 9 Oct 2017 09:50:49 -0700 Subject: [PATCH 214/217] x86/mm: Flush more aggressively in lazy TLB mode Since commit: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") x86's lazy TLB mode has been all the way lazy: when running a kernel thread (including the idle thread), the kernel keeps using the last user mm's page tables without attempting to maintain user TLB coherence at all. From a pure semantic perspective, this is fine -- kernel threads won't attempt to access user pages, so having stale TLB entries doesn't matter. Unfortunately, I forgot about a subtlety. By skipping TLB flushes, we also allow any paging-structure caches that may exist on the CPU to become incoherent. This means that we can have a paging-structure cache entry that references a freed page table, and the CPU is within its rights to do a speculative page walk starting at the freed page table. I can imagine this causing two different problems: - A speculative page walk starting from a bogus page table could read IO addresses. I haven't seen any reports of this causing problems. - A speculative page walk that involves a bogus page table can install garbage in the TLB. Such garbage would always be at a user VA, but some AMD CPUs have logic that triggers a machine check when it notices these bogus entries. I've seen a couple reports of this. Boris further explains the failure mode: > It is actually more of an optimization which assumes that paging-structure > entries are in WB DRAM: > > "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables > performance optimization that assumes PML4, PDP, PDE, and PTE entries > are in cacheable WB-DRAM; memory type checks may be bypassed, and > addresses outside of WB-DRAM may result in undefined behavior or NB > protocol errors. 1=Disables performance optimization and allows PML4, > PDP, PDE and PTE entries to be in any memory type. Operating systems > that maintain page tables in memory types other than WB- DRAM must set > TlbCacheDis to insure proper operation." > > The MCE generated is an NB protocol error to signal that > > "Link: A specific coherent-only packet from a CPU was issued to an > IO link. This may be caused by software which addresses page table > structures in a memory type other than cacheable WB-DRAM without > properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for > example, when page table structure addresses are above top of memory. In > such cases, the NB will generate an MCE if it sees a mismatch between > the memory operation generated by the core and the link type." > > I'm assuming coherent-only packets don't go out on IO links, thus the > error. To fix this, reinstate TLB coherence in lazy mode. With this patch applied, we do it in one of two ways: - If we have PCID, we simply switch back to init_mm's page tables when we enter a kernel thread -- this seems to be quite cheap except for the cost of serializing the CPU. - If we don't have PCID, then we set a flag and switch to init_mm the first time we would otherwise need to flush the TLB. The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed to override the default mode for benchmarking. In theory, we could optimize this better by only flushing the TLB in lazy CPUs when a page table is freed. Doing that would require auditing the mm code to make sure that all page table freeing goes through tlb_remove_page() as well as reworking some data structures to implement the improved flush logic. Reported-by: Markus Trippelsdorf Reported-by: Adam Borowski Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Borkmann Cc: Eric Biggers Cc: Johannes Hirte Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Roman Kagan Cc: Thomas Gleixner Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 8 +- arch/x86/include/asm/tlbflush.h | 24 +++++ arch/x86/mm/tlb.c | 153 +++++++++++++++++++++-------- 3 files changed, 136 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c120b5db178a..3c856a15b98e 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) DEBUG_LOCKS_WARN_ON(preemptible()); } -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ - int cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, mm_cpumask(mm))) - cpumask_clear_cpu(cpu, mm_cpumask(mm)); -} +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4893abf7f74f..d362161d3291 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point + * to init_mm when we switch to a kernel thread (e.g. the idle thread). If + * it's false, then we immediately switch CR3 when entering a kernel thread. + */ +DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + /* * 6 because 6 should be plenty and struct tlb_state will fit in * two cache lines. @@ -104,6 +111,23 @@ struct tlb_state { u16 loaded_mm_asid; u16 next_asid; + /* + * We can be in one of several states: + * + * - Actively using an mm. Our CPU's bit will be set in + * mm_cpumask(loaded_mm) and is_lazy == false; + * + * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit + * will not be set in mm_cpumask(&init_mm) and is_lazy == false. + * + * - Lazily using a real mm. loaded_mm != &init_mm, our bit + * is set in mm_cpumask(loaded_mm), but is_lazy == true. + * We're heuristically guessing that the CR3 load we + * skipped more than makes up for the overhead added by + * lazy mode. + */ + bool is_lazy; + /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 49d9778376d7..658bf0090565 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -30,6 +30,8 @@ atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 *new_asid, bool *need_flush) { @@ -80,7 +82,7 @@ void leave_mm(int cpu) return; /* Warn if we're not lazy. */ - WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); switch_mm(NULL, &init_mm, NULL); } @@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, __flush_tlb_all(); } #endif + this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); - if (cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * There's nothing to do: we weren't lazy, and we - * aren't changing our mm. We don't need to flush - * anything, nor do we need to update CR3, CR4, or - * LDTR. - */ - return; - } - - /* Resume remote flushes and then read tlb_gen. */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < - next_tlb_gen) { - /* - * Ideally, we'd have a flush_tlb() variant that - * takes the known CR3 value as input. This would - * be faster on Xen PV and on hypothetical CPUs - * on which INVPCID is fast. - */ - this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, - next_tlb_gen); - write_cr3(build_cr3(next, prev_asid)); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); - } - /* - * We just exited lazy mode, which means that CR4 and/or LDTR - * may be stale. (Changes to the required CR4 and LDTR states - * are not reflected in tlb_gen.) + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ + if (WARN_ON_ONCE(real_prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + + return; } else { u16 new_asid; bool need_flush; @@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* Stop remote flushes for the previous mm */ - if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - - VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && + real_prev != &init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); /* * Start remote flushes and then read tlb_gen. @@ -232,6 +212,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_ldt(real_prev, next); } +/* + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). + */ +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; + + if (static_branch_unlikely(&tlb_use_lazy_mode)) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } +} + /* * Call this when reinitializing a CPU. It fixes the following potential * problems: @@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + if (unlikely(loaded_mm == &init_mm)) + return; + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + if (this_cpu_read(cpu_tlbstate.is_lazy)) { /* - * We're in lazy mode -- don't flush. We can get here on - * remote flushes due to races and on local flushes if a - * kernel thread coincidentally flushes the mm it's lazily - * still using. + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. */ + switch_mm_irqs_off(NULL, &init_mm, NULL); return; } @@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[2]; + + buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0'; + buf[1] = '\n'; + + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t tlblazy_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + bool val; + + if (kstrtobool_from_user(user_buf, count, &val)) + return -EINVAL; + + if (val) + static_branch_enable(&tlb_use_lazy_mode); + else + static_branch_disable(&tlb_use_lazy_mode); + + return count; +} + +static const struct file_operations fops_tlblazy = { + .read = tlblazy_read_file, + .write = tlblazy_write_file, + .llseek = default_llseek, +}; + +static int __init init_tlb_use_lazy_mode(void) +{ + if (boot_cpu_has(X86_FEATURE_PCID)) { + /* + * Heuristic: with PCID on, switching to and from + * init_mm is reasonably fast, but remote flush IPIs + * as expensive as ever, so turn off lazy TLB mode. + * + * We can't do this in setup_pcid() because static keys + * haven't been initialized yet, and it would blow up + * badly. + */ + static_branch_disable(&tlb_use_lazy_mode); + } + + debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlblazy); + return 0; +} +late_initcall(init_tlb_use_lazy_mode); From b483cf3bc249d7af706390efa63d6671e80d1c09 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Oct 2017 09:26:59 +0200 Subject: [PATCH 215/217] locking/lockdep: Disable cross-release features for now Johan Hovold reported a big lockdep slowdown on his system, caused by lockdep: > I had noticed that the BeagleBone Black boot time appeared to have > increased significantly with 4.14 and yesterday I finally had time to > investigate it. > > Boot time (from "Linux version" to login prompt) had in fact doubled > since 4.13 where it took 17 seconds (with my current config) compared to > the 35 seconds I now see with 4.14-rc4. > > I quick bisect pointed to lockdep and specifically the following commit: > > 28a903f63ec0 ("locking/lockdep: Handle non(or multi)-acquisition of a crosslock") Because the final v4.14 release is close, disable the cross-release lockdep features for now. Bisected-by: Johan Hovold Debugged-by: Johan Hovold Reported-by: Johan Hovold Cc: Arnd Bergmann Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Lindgren Cc: kernel-team@lge.com Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mm@kvack.org Cc: linux-omap@vger.kernel.org Link: http://lkml.kernel.org/r/20171014072659.f2yr6mhm5ha3eou7@gmail.com Signed-off-by: Ingo Molnar --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2689b7c50c52..e2705843c524 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1092,8 +1092,8 @@ config PROVE_LOCKING select DEBUG_MUTEXES select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC - select LOCKDEP_CROSSRELEASE - select LOCKDEP_COMPLETIONS + select LOCKDEP_CROSSRELEASE if BROKEN + select LOCKDEP_COMPLETIONS if BROKEN select TRACE_IRQFLAGS default n help From 1f161f67a272cc4f29f27934dd3f74cb657eb5c4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 12 Oct 2017 13:23:16 +0200 Subject: [PATCH 216/217] x86/microcode: Do the family check first On CPUs like AMD's Geode, for example, we shouldn't even try to load microcode because they do not support the modern microcode loading interface. However, we do the family check *after* the other checks whether the loader has been disabled on the command line or whether we're running in a guest. So move the family checks first in order to exit early if we're being loaded on an unsupported family. Reported-and-tested-by: Sven Glodowski Signed-off-by: Borislav Petkov Cc: # 4.11.. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://bugzilla.suse.com/show_bug.cgi?id=1061396 Link: http://lkml.kernel.org/r/20171012112316.977-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 86e8f0b2537b..c4fa4a85d4cb 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (!have_cpuid_p()) - return *res; - /* * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not * completely accurate as xen pv guests don't see that CPUID bit set but @@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; + bool intel = true; - if (check_loader_disabled_bsp()) + if (!have_cpuid_p()) return; cpuid_1_eax = native_cpuid_eax(1); switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (x86_family(cpuid_1_eax) >= 6) - load_ucode_intel_bsp(); + if (x86_family(cpuid_1_eax) < 6) + return; break; + case X86_VENDOR_AMD: - if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_bsp(cpuid_1_eax); + if (x86_family(cpuid_1_eax) < 0x10) + return; + intel = false; break; + default: - break; + return; } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(cpuid_1_eax); } static bool check_loader_disabled_ap(void) From 33d930e59a98fa10a0db9f56c7fa2f21a4aef9b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Oct 2017 21:01:12 -0400 Subject: [PATCH 217/217] Linux 4.14-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5bf6fa4d62d8..46bfb0ed2257 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Fearless Coyote # *DOCUMENTATION*