From 291b5c9870fc546376d69cf792b7885cd0c9c1b3 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 21 Dec 2020 19:59:31 -0700 Subject: [PATCH 01/69] i3c/master/mipi-i3c-hci: Fix position of __maybe_unused in i3c_hci_of_match Clang warns: ../drivers/i3c/master/mipi-i3c-hci/core.c:780:21: warning: attribute declaration must precede definition [-Wignored-attributes] static const struct __maybe_unused of_device_id i3c_hci_of_match[] = { ^ ../include/linux/compiler_attributes.h:267:56: note: expanded from macro '__maybe_unused' #define __maybe_unused __attribute__((__unused__)) ^ ../include/linux/mod_devicetable.h:262:8: note: previous definition is here struct of_device_id { ^ 1 warning generated. 'struct of_device_id' should not be split, as it is a type. Move the __maybe_unused attribute after the static and const qualifiers so that there are no warnings about this variable, period. Fixes: 95393f3e07ab ("i3c/master/mipi-i3c-hci: quiet maybe-unused variable warning") Link: https://github.com/ClangBuiltLinux/linux/issues/1221 Signed-off-by: Nathan Chancellor Acked-by: Nicolas Pitre Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201222025931.3043480-1-natechancellor@gmail.com --- drivers/i3c/master/mipi-i3c-hci/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index 500abd27fb22..1b73647cc3b1 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -777,7 +777,7 @@ static int i3c_hci_remove(struct platform_device *pdev) return 0; } -static const struct __maybe_unused of_device_id i3c_hci_of_match[] = { +static const __maybe_unused struct of_device_id i3c_hci_of_match[] = { { .compatible = "mipi-i3c-hci", }, {}, }; From 16e19e11228ba660d9e322035635e7dcf160d5c2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 15 Jan 2021 14:52:52 -0700 Subject: [PATCH 02/69] dmaengine: idxd: Fix list corruption in description completion Sanjay reported the following kernel splat after running dmatest for stress testing. The current code is giving up the spinlock in the middle of a completion list walk, and that opens up opportunity for list corruption if another thread touches the list at the same time. In order to make sure the list is always protected, the hardware completed descriptors will be put on a local list to be completed with callbacks from the outside of the list lock. kernel: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP NOPTI kernel: CPU: 62 PID: 1814 Comm: irq/89-idxd-por Tainted: G W 5.10.0-intel-next_10_16+ #1 kernel: Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.SBT.4915.D02.2012070418 12/07/2020 kernel: RIP: 0010:irq_process_work_list+0xcd/0x170 [idxd] kernel: Code: e8 18 65 5c d3 8b 45 d4 85 c0 75 b3 4c 89 f7 e8 b9 fe ff ff 84 c0 74 bf 4c 89 e7 e8 dd 6b 5c d3 49 8b 3f 49 8b 4f 08 48 89 c6 <48> 89 4f 08 48 89 39 4c 89 e7 48 b9 00 01 00 00 00 00 ad de 49 89 kernel: RSP: 0018:ff256768c4353df8 EFLAGS: 00010046 kernel: RAX: 0000000000000202 RBX: dead000000000100 RCX: dead000000000122 kernel: RDX: 0000000000000001 RSI: 0000000000000202 RDI: dead000000000100 kernel: RBP: ff256768c4353e40 R08: 0000000000000001 R09: 0000000000000000 kernel: R10: 0000000000000000 R11: 00000000ffffffff R12: ff1fdf9fd06b3e48 kernel: R13: 0000000000000005 R14: ff1fdf9fc4275980 R15: ff1fdf9fc4275a00 kernel: FS: 0000000000000000(0000) GS:ff1fdfa32f880000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007f87f24012a0 CR3: 000000010f630004 CR4: 0000000003771ee0 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 kernel: DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 kernel: PKRU: 55555554 kernel: Call Trace: kernel: ? irq_thread+0xa9/0x1b0 kernel: idxd_wq_thread+0x34/0x90 [idxd] kernel: irq_thread_fn+0x24/0x60 kernel: irq_thread+0x10f/0x1b0 kernel: ? irq_forced_thread_fn+0x80/0x80 kernel: ? wake_threads_waitq+0x30/0x30 kernel: ? irq_thread_check_affinity+0xe0/0xe0 kernel: kthread+0x142/0x160 kernel: ? kthread_park+0x90/0x90 kernel: ret_from_fork+0x1f/0x30 kernel: Modules linked in: idxd_ktest dmatest intel_rapl_msr idxd_mdev iTCO_wdt vfio_pci vfio_virqfd iTCO_vendor_support intel_rapl_common i10nm_edac nfit x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper rapl msr pcspkr snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_intel snd_intel_dspcfg ofpart snd_hda_codec snd_hda_core snd_hwdep cmdlinepart snd_seq snd_seq_device idxd snd_pcm intel_spi_pci intel_spi snd_timer spi_nor input_leds joydev snd i2c_i801 mtd soundcore i2c_smbus mei_me mei i2c_ismt ipmi_ssif acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler mac_hid sunrpc nls_iso8859_1 sch_fq_codel ip_tables x_tables xfs libcrc32c ast drm_vram_helper drm_ttm_helper ttm igc wmi pinctrl_sunrisepoint hid_generic usbmouse usbkbd usbhid hid kernel: ---[ end trace cd5ca950ef0db25f ]--- Reported-by: Sanjay Kumar Signed-off-by: Dave Jiang Tested-by: Sanjay Kumar Fixes: e4f4d8cdeb9a ("dmaengine: idxd: Clean up descriptors with fault error") Link: https://lore.kernel.org/r/161074757267.2183951.17912830858060607266.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/irq.c | 86 +++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 593a2f6ed16c..fe996cec7c74 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -219,29 +219,27 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } -static bool process_fault(struct idxd_desc *desc, u64 fault_addr) +static inline bool match_fault(struct idxd_desc *desc, u64 fault_addr) { /* * Completion address can be bad as well. Check fault address match for descriptor * and completion address. */ - if ((u64)desc->hw == fault_addr || - (u64)desc->completion == fault_addr) { - idxd_dma_complete_txd(desc, IDXD_COMPLETE_DEV_FAIL); + if ((u64)desc->hw == fault_addr || (u64)desc->completion == fault_addr) { + struct idxd_device *idxd = desc->wq->idxd; + struct device *dev = &idxd->pdev->dev; + + dev_warn(dev, "desc with fault address: %#llx\n", fault_addr); return true; } return false; } -static bool complete_desc(struct idxd_desc *desc) +static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason) { - if (desc->completion->status) { - idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); - return true; - } - - return false; + idxd_dma_complete_txd(desc, reason); + idxd_free_desc(desc->wq, desc); } static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, @@ -251,25 +249,25 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, struct idxd_desc *desc, *t; struct llist_node *head; int queued = 0; - bool completed = false; unsigned long flags; + enum idxd_complete_type reason; *processed = 0; head = llist_del_all(&irq_entry->pending_llist); if (!head) goto out; - llist_for_each_entry_safe(desc, t, head, llnode) { - if (wtype == IRQ_WORK_NORMAL) - completed = complete_desc(desc); - else if (wtype == IRQ_WORK_PROCESS_FAULT) - completed = process_fault(desc, data); + if (wtype == IRQ_WORK_NORMAL) + reason = IDXD_COMPLETE_NORMAL; + else + reason = IDXD_COMPLETE_DEV_FAIL; - if (completed) { - idxd_free_desc(desc->wq, desc); + llist_for_each_entry_safe(desc, t, head, llnode) { + if (desc->completion->status) { + if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS) + match_fault(desc, data); + complete_desc(desc, reason); (*processed)++; - if (wtype == IRQ_WORK_PROCESS_FAULT) - break; } else { spin_lock_irqsave(&irq_entry->list_lock, flags); list_add_tail(&desc->list, @@ -287,42 +285,46 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, enum irq_work_type wtype, int *processed, u64 data) { - struct list_head *node, *next; int queued = 0; - bool completed = false; unsigned long flags; + LIST_HEAD(flist); + struct idxd_desc *desc, *n; + enum idxd_complete_type reason; *processed = 0; + if (wtype == IRQ_WORK_NORMAL) + reason = IDXD_COMPLETE_NORMAL; + else + reason = IDXD_COMPLETE_DEV_FAIL; + + /* + * This lock protects list corruption from access of list outside of the irq handler + * thread. + */ spin_lock_irqsave(&irq_entry->list_lock, flags); - if (list_empty(&irq_entry->work_list)) - goto out; - - list_for_each_safe(node, next, &irq_entry->work_list) { - struct idxd_desc *desc = - container_of(node, struct idxd_desc, list); - + if (list_empty(&irq_entry->work_list)) { spin_unlock_irqrestore(&irq_entry->list_lock, flags); - if (wtype == IRQ_WORK_NORMAL) - completed = complete_desc(desc); - else if (wtype == IRQ_WORK_PROCESS_FAULT) - completed = process_fault(desc, data); + return 0; + } - if (completed) { - spin_lock_irqsave(&irq_entry->list_lock, flags); + list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) { + if (desc->completion->status) { list_del(&desc->list); - spin_unlock_irqrestore(&irq_entry->list_lock, flags); - idxd_free_desc(desc->wq, desc); (*processed)++; - if (wtype == IRQ_WORK_PROCESS_FAULT) - return queued; + list_add_tail(&desc->list, &flist); } else { queued++; } - spin_lock_irqsave(&irq_entry->list_lock, flags); } - out: spin_unlock_irqrestore(&irq_entry->list_lock, flags); + + list_for_each_entry(desc, &flist, list) { + if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS) + match_fault(desc, data); + complete_desc(desc, reason); + } + return queued; } From f5cc9ace24fbdf41b4814effbb2f9bad7046e988 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 15 Jan 2021 14:52:33 -0700 Subject: [PATCH 03/69] dmaengine: idxd: fix misc interrupt completion Nikhil reported the misc interrupt handler can sometimes miss handling the command interrupt when an error interrupt happens near the same time. Have the irq handling thread continue to process the misc interrupts until all interrupts are processed. This is a low usage interrupt and is not expected to handle high volume traffic. Therefore there is no concern of this thread running for a long time. Fixes: 0d5c10b4c84d ("dmaengine: idxd: add work queue drain support") Reported-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161074755329.2183844.13295528344116907983.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/irq.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index fe996cec7c74..a60ca11a5784 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -111,19 +111,14 @@ irqreturn_t idxd_irq_handler(int vec, void *data) return IRQ_WAKE_THREAD; } -irqreturn_t idxd_misc_thread(int vec, void *data) +static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) { - struct idxd_irq_entry *irq_entry = data; - struct idxd_device *idxd = irq_entry->idxd; struct device *dev = &idxd->pdev->dev; union gensts_reg gensts; - u32 cause, val = 0; + u32 val = 0; int i; bool err = false; - cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); - iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); - if (cause & IDXD_INTC_ERR) { spin_lock_bh(&idxd->dev_lock); for (i = 0; i < 4; i++) @@ -181,7 +176,7 @@ irqreturn_t idxd_misc_thread(int vec, void *data) val); if (!err) - goto out; + return 0; /* * This case should rarely happen and typically is due to software @@ -211,10 +206,33 @@ irqreturn_t idxd_misc_thread(int vec, void *data) gensts.reset_type == IDXD_DEVICE_RESET_FLR ? "FLR" : "system reset"); spin_unlock_bh(&idxd->dev_lock); + return -ENXIO; } } - out: + return 0; +} + +irqreturn_t idxd_misc_thread(int vec, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + struct idxd_device *idxd = irq_entry->idxd; + int rc; + u32 cause; + + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (cause) + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + + while (cause) { + rc = process_misc_interrupts(idxd, cause); + if (rc < 0) + break; + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (cause) + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + } + idxd_unmask_msix_vector(idxd, irq_entry->id); return IRQ_HANDLED; } From e594443196d6e0ef3d3b30320c49b3a4d4f9a547 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 18 Jan 2021 10:28:44 -0700 Subject: [PATCH 04/69] dmaengine: move channel device_node deletion to driver Channel device_node deletion is managed by the device driver rather than the dmaengine core. The deletion was accidentally introduced when making channel unregister dynamic. It causes xilinx_dma module to crash on unload as reported by Radhey. Remove chan->device_node delete in dmaengine and also fix up idxd driver. [ 42.142705] Internal error: Oops: 96000044 [#1] SMP [ 42.147566] Modules linked in: xilinx_dma(-) clk_xlnx_clock_wizard uio_pdrv_genirq [ 42.155139] CPU: 1 PID: 2075 Comm: rmmod Not tainted 5.10.1-00026-g3a2e6dd7a05-dirty #192 [ 42.163302] Hardware name: Enclustra XU5 SOM (DT) [ 42.167992] pstate: 40000005 (nZcv daif -PAN -UAO -TCO BTYPE=--) [ 42.173996] pc : xilinx_dma_chan_remove+0x74/0xa0 [xilinx_dma] [ 42.179815] lr : xilinx_dma_chan_remove+0x70/0xa0 [xilinx_dma] [ 42.185636] sp : ffffffc01112bca0 [ 42.188935] x29: ffffffc01112bca0 x28: ffffff80402ea640 xilinx_dma_chan_remove+0x74/0xa0: __list_del at ./include/linux/list.h:112 (inlined by) __list_del_entry at./include/linux/list.h:135 (inlined by) list_del at ./include/linux/list.h:146 (inlined by) xilinx_dma_chan_remove at drivers/dma/xilinx/xilinx_dma.c:2546 Fixes: e81274cd6b52 ("dmaengine: add support to dynamic register/unregister of channels") Reported-by: Radhey Shyam Pandey Signed-off-by: Dave Jiang Tested-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/161099092469.2495902.5064826526660062342.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Cc: stable@vger.kernel.org # 5.9+ --- drivers/dma/dmaengine.c | 1 - drivers/dma/idxd/dma.c | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 962cbb5e5f7f..fe6a460c4373 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1110,7 +1110,6 @@ static void __dma_async_device_channel_unregister(struct dma_device *device, "%s called while %d clients hold a reference\n", __func__, chan->client_count); mutex_lock(&dma_list_mutex); - list_del(&chan->device_node); device->chancnt--; chan->dev->chan = NULL; mutex_unlock(&dma_list_mutex); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 8ed2773d8285..71fd6e4c42cd 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -205,5 +205,8 @@ int idxd_register_dma_channel(struct idxd_wq *wq) void idxd_unregister_dma_channel(struct idxd_wq *wq) { - dma_async_device_channel_unregister(&wq->idxd->dma_dev, &wq->dma_chan); + struct dma_chan *chan = &wq->dma_chan; + + dma_async_device_channel_unregister(&wq->idxd->dma_dev, chan); + list_del(&chan->device_node); } From fed1b6a00a191cad4dd843519b590e3d6ad9f843 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 24 Jan 2021 08:09:23 +0100 Subject: [PATCH 05/69] dmaengine: ti: k3-udma: Fix a resource leak in an error handling path In 'dma_pool_create()', we return -ENOMEM, but don't release the resources already allocated, as in all the other error handling paths. Go to 'err_res_free' instead of returning directly. Fixes: 017794739702 ("dmaengine: ti: k3-udma: Initial support for K3 BCDMA") Signed-off-by: Christophe JAILLET Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20210124070923.724479-1-christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 298460438bb4..f474a1232335 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -2401,7 +2401,8 @@ static int bcdma_alloc_chan_resources(struct dma_chan *chan) dev_err(ud->ddev.dev, "Descriptor pool allocation failed\n"); uc->use_dma_pool = false; - return -ENOMEM; + ret = -ENOMEM; + goto err_res_free; } uc->use_dma_pool = true; From b64acb28da8394485f0762e657470c9fc33aca4d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2021 12:36:42 +0100 Subject: [PATCH 06/69] ath9k: fix build error with LEDS_CLASS=m When CONFIG_ATH9K is built-in but LED support is in a loadable module, both ath9k drivers fails to link: x86_64-linux-ld: drivers/net/wireless/ath/ath9k/gpio.o: in function `ath_deinit_leds': gpio.c:(.text+0x36): undefined reference to `led_classdev_unregister' x86_64-linux-ld: drivers/net/wireless/ath/ath9k/gpio.o: in function `ath_init_leds': gpio.c:(.text+0x179): undefined reference to `led_classdev_register_ext' The problem is that the 'imply' keyword does not enforce any dependency but is only a weak hint to Kconfig to enable another symbol from a defconfig file. Change imply to a 'depends on LEDS_CLASS' that prevents the incorrect configuration but still allows building the driver without LED support. The 'select MAC80211_LEDS' is now ensures that the LED support is actually used if it is present, and the added Kconfig dependency on MAC80211_LEDS ensures that it cannot be enabled manually when it has no effect. Fixes: 197f466e93f5 ("ath9k_htc: Do not select MAC80211_LEDS by default") Signed-off-by: Arnd Bergmann Acked-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210125113654.2408057-1-arnd@kernel.org --- drivers/net/wireless/ath/ath9k/Kconfig | 8 ++------ net/mac80211/Kconfig | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/Kconfig b/drivers/net/wireless/ath/ath9k/Kconfig index a84bb9b6573f..e150d82eddb6 100644 --- a/drivers/net/wireless/ath/ath9k/Kconfig +++ b/drivers/net/wireless/ath/ath9k/Kconfig @@ -21,11 +21,9 @@ config ATH9K_BTCOEX_SUPPORT config ATH9K tristate "Atheros 802.11n wireless cards support" depends on MAC80211 && HAS_DMA + select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211 select ATH9K_HW select ATH9K_COMMON - imply NEW_LEDS - imply LEDS_CLASS - imply MAC80211_LEDS help This module adds support for wireless adapters based on Atheros IEEE 802.11n AR5008, AR9001 and AR9002 family @@ -176,11 +174,9 @@ config ATH9K_PCI_NO_EEPROM config ATH9K_HTC tristate "Atheros HTC based wireless cards support" depends on USB && MAC80211 + select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211 select ATH9K_HW select ATH9K_COMMON - imply NEW_LEDS - imply LEDS_CLASS - imply MAC80211_LEDS help Support for Atheros HTC based cards. Chipsets supported: AR9271 diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index cd9a9bd242ba..51ec8256b7fa 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -69,7 +69,7 @@ config MAC80211_MESH config MAC80211_LEDS bool "Enable LED triggers" depends on MAC80211 - depends on LEDS_CLASS + depends on LEDS_CLASS=y || LEDS_CLASS=MAC80211 select LEDS_TRIGGERS help This option enables a few LED triggers for different From 93a1d4791c10d443bc67044def7efee2991d48b7 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 26 Jan 2021 12:02:13 +0100 Subject: [PATCH 07/69] mt76: dma: fix a possible memory leak in mt76_add_fragment() Fix a memory leak in mt76_add_fragment routine returning the buffer to the page_frag_cache when we receive a new fragment and the skb_shared_info frag array is full. Fixes: b102f0c522cf6 ("mt76: fix array overflow on receiving too many fragments for a packet") Signed-off-by: Lorenzo Bianconi Acked-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/4f9dd73407da88b2a552517ce8db242d86bf4d5c.1611616130.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/dma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 73eeb00d5aa6..e81dfaf99bcb 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -509,15 +509,17 @@ static void mt76_add_fragment(struct mt76_dev *dev, struct mt76_queue *q, void *data, int len, bool more) { - struct page *page = virt_to_head_page(data); - int offset = data - page_address(page); struct sk_buff *skb = q->rx_head; struct skb_shared_info *shinfo = skb_shinfo(skb); if (shinfo->nr_frags < ARRAY_SIZE(shinfo->frags)) { - offset += q->buf_offset; + struct page *page = virt_to_head_page(data); + int offset = data - page_address(page) + q->buf_offset; + skb_add_rx_frag(skb, shinfo->nr_frags, page, offset, len, q->buf_size); + } else { + skb_free_frag(data); } if (more) From 89e3becd8f821e507052e012d2559dcda59f538e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 1 Feb 2021 08:26:14 -0700 Subject: [PATCH 08/69] dmaengine: idxd: check device state before issue command Add device state check before executing command. Without the check the command can be issued while device is in halt state and causes the driver to block while waiting for the completion of the command. Reported-by: Sanjay Kumar Signed-off-by: Dave Jiang Tested-by: Sanjay Kumar Fixes: 0d5c10b4c84d ("dmaengine: idxd: add work queue drain support") Link: https://lore.kernel.org/r/161219313921.2976211.12222625226450097465.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 23 ++++++++++++++++++++++- drivers/dma/idxd/idxd.h | 2 +- drivers/dma/idxd/init.c | 5 ++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 95f94a3ed6be..84a6ea60ecf0 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -398,17 +398,31 @@ static inline bool idxd_is_enabled(struct idxd_device *idxd) return false; } +static inline bool idxd_device_is_halted(struct idxd_device *idxd) +{ + union gensts_reg gensts; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + + return (gensts.state == IDXD_DEVICE_STATE_HALT); +} + /* * This is function is only used for reset during probe and will * poll for completion. Once the device is setup with interrupts, * all commands will be done via interrupt completion. */ -void idxd_device_init_reset(struct idxd_device *idxd) +int idxd_device_init_reset(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; union idxd_command_reg cmd; unsigned long flags; + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + return -ENXIO; + } + memset(&cmd, 0, sizeof(cmd)); cmd.cmd = IDXD_CMD_RESET_DEVICE; dev_dbg(dev, "%s: sending reset for init.\n", __func__); @@ -419,6 +433,7 @@ void idxd_device_init_reset(struct idxd_device *idxd) IDXD_CMDSTS_ACTIVE) cpu_relax(); spin_unlock_irqrestore(&idxd->dev_lock, flags); + return 0; } static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, @@ -428,6 +443,12 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, DECLARE_COMPLETION_ONSTACK(done); unsigned long flags; + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + *status = IDXD_CMDSTS_HW_ERR; + return; + } + memset(&cmd, 0, sizeof(cmd)); cmd.cmd = cmd_code; cmd.operand = operand; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 5a50e91c71bf..81a0e65fd316 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -326,7 +326,7 @@ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id); void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id); /* device control */ -void idxd_device_init_reset(struct idxd_device *idxd); +int idxd_device_init_reset(struct idxd_device *idxd); int idxd_device_enable(struct idxd_device *idxd); int idxd_device_disable(struct idxd_device *idxd); void idxd_device_reset(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 2c051e07c34c..fa04acd5582a 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -335,7 +335,10 @@ static int idxd_probe(struct idxd_device *idxd) int rc; dev_dbg(dev, "%s entered and resetting device\n", __func__); - idxd_device_init_reset(idxd); + rc = idxd_device_init_reset(idxd); + if (rc < 0) + return rc; + dev_dbg(dev, "IDXD reset complete\n"); if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM)) { From 548f1191d86ccb9bde2a5305988877b7584c01eb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Feb 2021 23:06:36 -0800 Subject: [PATCH 09/69] bpf: Unbreak BPF_PROG_TYPE_KPROBE when kprobe is called via do_int3 The commit 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") converted do_int3 handler to be "NMI-like". That made old if (in_nmi()) check abort execution of bpf programs attached to kprobe when kprobe is firing via int3 (For example when kprobe is placed in the middle of the function). Remove the check to restore user visible behavior. Fixes: 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") Reported-by: Nikolay Borisov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Nikolay Borisov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210203070636.70926-1-alexei.starovoitov@gmail.com --- kernel/trace/bpf_trace.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c0018abe68a..764400260eb6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,9 +96,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; - if (in_nmi()) /* not supported yet */ - return 1; - cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { From 6183f4d3a0a2ad230511987c6c362ca43ec0055f Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 27 Jan 2021 06:36:53 +0000 Subject: [PATCH 10/69] bpf: Check for integer overflow when using roundup_pow_of_two() On 32-bit architecture, roundup_pow_of_two() can return 0 when the argument has upper most bit set due to resulting 1UL << 32. Add a check for this case. Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE") Signed-off-by: Bui Quang Minh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210127063653.3576-1-minhquangbui99@gmail.com --- kernel/bpf/stackmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b638473..bfafbf115bf3 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -115,6 +115,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ n_buckets = roundup_pow_of_two(attr->max_entries); + if (!n_buckets) + return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); From a4dc7eee9106a9d2a6e08b442db19677aa9699c7 Mon Sep 17 00:00:00 2001 From: Christoph Schemmel Date: Tue, 2 Feb 2021 09:45:23 +0100 Subject: [PATCH 11/69] NET: usb: qmi_wwan: Adding support for Cinterion MV31 Adding support for Cinterion MV31 with PID 0x00B7. T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 11 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=1e2d ProdID=00b7 Rev=04.14 S: Manufacturer=Cinterion S: Product=Cinterion USB Mobile Broadband S: SerialNumber=b3246eed C: #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#=0x1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option Signed-off-by: Christoph Schemmel Link: https://lore.kernel.org/r/20210202084523.4371-1-christoph.schemmel@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index cc4819282820..5a05add9b4e6 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1309,6 +1309,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1e2d, 0x0082, 5)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0083, 4)}, /* Cinterion PHxx,PXxx (1 RmNet + USB Audio)*/ {QMI_QUIRK_SET_DTR(0x1e2d, 0x00b0, 4)}, /* Cinterion CLS8 */ + {QMI_FIXED_INTF(0x1e2d, 0x00b7, 0)}, /* Cinterion MV31 RmNet */ {QMI_FIXED_INTF(0x413c, 0x81a2, 8)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a3, 8)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a4, 8)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ From b1bdde33b72366da20d10770ab7a49fe87b5e190 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 29 Jan 2021 20:57:43 +0100 Subject: [PATCH 12/69] netfilter: xt_recent: Fix attempt to update deleted entry When both --reap and --update flag are specified, there's a code path at which the entry to be updated is reaped beforehand, which then leads to kernel crash. Reap only entries which won't be updated. Fixes kernel bugzilla #207773. Link: https://bugzilla.kernel.org/show_bug.cgi?id=207773 Reported-by: Reindl Harald Fixes: 0079c5aee348 ("netfilter: xt_recent: add an entry reaper") Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_recent.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 606411869698..0446307516cd 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) /* * Drop entries with timestamps older then 'time'. */ -static void recent_entry_reap(struct recent_table *t, unsigned long time) +static void recent_entry_reap(struct recent_table *t, unsigned long time, + struct recent_entry *working, bool update) { struct recent_entry *e; @@ -161,6 +162,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time) */ e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + /* + * Do not reap the entry which are going to be updated. + */ + if (e == working && update) + return; + /* * The last time stamp is the most recent. */ @@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) /* info->seconds must be non-zero */ if (info->check_set & XT_RECENT_REAP) - recent_entry_reap(t, time); + recent_entry_reap(t, time, e, + info->check_set & XT_RECENT_UPDATE && ret); } if (info->check_set & XT_RECENT_SET || From a3005b0f83f217c888393c6bf9cd36e3d1616bca Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sat, 30 Jan 2021 11:14:25 +0100 Subject: [PATCH 13/69] selftests: netfilter: fix current year use date %Y instead of %G to read current year Problem appeared when running lkp-tests on 01/01/2021 Fixes: 48d072c4e8cd ("selftests: netfilter: add time counter check") Reported-by: kernel test robot Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_meta.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index 087f0e6e71ce..f33154c04d34 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -23,7 +23,7 @@ ip -net "$ns0" addr add 127.0.0.1 dev lo trap cleanup EXIT -currentyear=$(date +%G) +currentyear=$(date +%Y) lastyear=$((currentyear-1)) ip netns exec "$ns0" nft -f /dev/stdin < Date: Tue, 2 Feb 2021 16:07:37 +0100 Subject: [PATCH 14/69] netfilter: nftables: fix possible UAF over chains from packet path in netns Although hooks are released via call_rcu(), chain and rule objects are immediately released while packets are still walking over these bits. This patch adds the .pre_exit callback which is invoked before synchronize_rcu() in the netns framework to stay safe. Remove a comment which is not valid anymore since the core does not use synchronize_net() anymore since 8c873e219970 ("netfilter: core: free hooks with call_rcu"). Suggested-by: Florian Westphal Fixes: df05ef874b28 ("netfilter: nf_tables: release objects on netns destruction") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d3aa97b52e7..43fe80f10313 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8949,6 +8949,17 @@ int __nft_release_basechain(struct nft_ctx *ctx) } EXPORT_SYMBOL_GPL(__nft_release_basechain); +static void __nft_release_hooks(struct net *net) +{ + struct nft_table *table; + struct nft_chain *chain; + + list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(chain, &table->chains, list) + nf_tables_unregister_hook(net, table, chain); + } +} + static void __nft_release_tables(struct net *net) { struct nft_flowtable *flowtable, *nf; @@ -8964,10 +8975,6 @@ static void __nft_release_tables(struct net *net) list_for_each_entry_safe(table, nt, &net->nft.tables, list) { ctx.family = table->family; - - list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hook(net, table, chain); - /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { ctx.chain = chain; @@ -9016,6 +9023,11 @@ static int __net_init nf_tables_init_net(struct net *net) return 0; } +static void __net_exit nf_tables_pre_exit_net(struct net *net) +{ + __nft_release_hooks(net); +} + static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); @@ -9029,8 +9041,9 @@ static void __net_exit nf_tables_exit_net(struct net *net) } static struct pernet_operations nf_tables_net_ops = { - .init = nf_tables_init_net, - .exit = nf_tables_exit_net, + .init = nf_tables_init_net, + .pre_exit = nf_tables_pre_exit_net, + .exit = nf_tables_exit_net, }; static int __init nf_tables_module_init(void) From 8d6bca156e47d68551750a384b3ff49384c67be3 Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Tue, 2 Feb 2021 18:01:16 +0100 Subject: [PATCH 15/69] netfilter: flowtable: fix tcp and udp header checksum update When updating the tcp or udp header checksum on port nat the function inet_proto_csum_replace2 with the last parameter pseudohdr as true. This leads to an error in the case that GRO is used and packets are split up in GSO. The tcp or udp checksum of all packets is incorrect. The error is probably masked due to the fact the most network driver implement tcp/udp checksum offloading. It also only happens when GRO is applied and not on single packets. The error is most visible when using a pppoe connection which is not triggering the tcp/udp checksum offload. Fixes: ac2a66665e23 ("netfilter: add generic flow table infrastructure") Signed-off-by: Sven Auhagen Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 513f78db3cb2..4a4acbba78ff 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -399,7 +399,7 @@ static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, return -1; tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); return 0; } @@ -415,7 +415,7 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, - new_port, true); + new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } From 2a80c15812372e554474b1dba0b1d8e467af295d Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Tue, 2 Feb 2021 15:20:59 +0600 Subject: [PATCH 16/69] net/qrtr: restrict user-controlled length in qrtr_tun_write_iter() syzbot found WARNING in qrtr_tun_write_iter [1] when write_iter length exceeds KMALLOC_MAX_SIZE causing order >= MAX_ORDER condition. Additionally, there is no check for 0 length write. [1] WARNING: mm/page_alloc.c:5011 [..] Call Trace: alloc_pages_current+0x18c/0x2a0 mm/mempolicy.c:2267 alloc_pages include/linux/gfp.h:547 [inline] kmalloc_order+0x2e/0xb0 mm/slab_common.c:837 kmalloc_order_trace+0x14/0x120 mm/slab_common.c:853 kmalloc include/linux/slab.h:557 [inline] kzalloc include/linux/slab.h:682 [inline] qrtr_tun_write_iter+0x8a/0x180 net/qrtr/tun.c:83 call_write_iter include/linux/fs.h:1901 [inline] Reported-by: syzbot+c2a7e5c5211605a90865@syzkaller.appspotmail.com Signed-off-by: Sabyrzhan Tasbolatov Link: https://lore.kernel.org/r/20210202092059.1361381-1-snovitoll@gmail.com Signed-off-by: Jakub Kicinski --- net/qrtr/tun.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index 15ce9b642b25..b238c40a9984 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -80,6 +80,12 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; void *kbuf; + if (!len) + return -EINVAL; + + if (len > KMALLOC_MAX_SIZE) + return -ENOMEM; + kbuf = kzalloc(len, GFP_KERNEL); if (!kbuf) return -ENOMEM; From d795cc02a297df80910cf4ba23147680d15d8a7d Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 3 Feb 2021 23:37:14 +0300 Subject: [PATCH 17/69] selftests/tls: fix selftest with CHACHA20-POLY1305 TLS selftests were broken also because of use of structure that was not exported to UAPI. Fix by defining the union in tests. Fixes: 4f336e88a870 (selftests/tls: add CHACHA20-POLY1305 to tls selftests) Reported-by: Rong Chen Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/1612384634-5377-1-git-send-email-vfedorenko@novek.ru Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index e0088c2d38a5..426d07875a48 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -133,7 +133,10 @@ FIXTURE_VARIANT_ADD(tls, 13_chacha) FIXTURE_SETUP(tls) { - union tls_crypto_context tls12; + union { + struct tls12_crypto_info_aes_gcm_128 aes128; + struct tls12_crypto_info_chacha20_poly1305 chacha20; + } tls12; struct sockaddr_in addr; socklen_t len; int sfd, ret; @@ -143,14 +146,16 @@ FIXTURE_SETUP(tls) len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = variant->tls_version; - tls12.info.cipher_type = variant->cipher_type; switch (variant->cipher_type) { case TLS_CIPHER_CHACHA20_POLY1305: - tls12_sz = sizeof(tls12_crypto_info_chacha20_poly1305); + tls12_sz = sizeof(struct tls12_crypto_info_chacha20_poly1305); + tls12.chacha20.info.version = variant->tls_version; + tls12.chacha20.info.cipher_type = variant->cipher_type; break; case TLS_CIPHER_AES_GCM_128: - tls12_sz = sizeof(tls12_crypto_info_aes_gcm_128); + tls12_sz = sizeof(struct tls12_crypto_info_aes_gcm_128); + tls12.aes128.info.version = variant->tls_version; + tls12.aes128.info.cipher_type = variant->cipher_type; break; default: tls12_sz = 0; From ec7d8e7dd3a59528e305a18e93f1cb98f7faf83b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 2 Feb 2021 08:09:38 +0100 Subject: [PATCH 18/69] xen/netback: avoid race in xenvif_rx_ring_slots_available() Since commit 23025393dbeb3b8b3 ("xen/netback: use lateeoi irq binding") xenvif_rx_ring_slots_available() is no longer called only from the rx queue kernel thread, so it needs to access the rx queue with the associated queue held. Reported-by: Igor Druzhinin Fixes: 23025393dbeb3b8b3 ("xen/netback: use lateeoi irq binding") Signed-off-by: Juergen Gross Acked-by: Wei Liu Link: https://lore.kernel.org/r/20210202070938.7863-1-jgross@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/xen-netback/rx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c index b8febe1d1bfd..accc991d153f 100644 --- a/drivers/net/xen-netback/rx.c +++ b/drivers/net/xen-netback/rx.c @@ -38,10 +38,15 @@ static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) RING_IDX prod, cons; struct sk_buff *skb; int needed; + unsigned long flags; + + spin_lock_irqsave(&queue->rx_queue.lock, flags); skb = skb_peek(&queue->rx_queue); - if (!skb) + if (!skb) { + spin_unlock_irqrestore(&queue->rx_queue.lock, flags); return false; + } needed = DIV_ROUND_UP(skb->len, XEN_PAGE_SIZE); if (skb_is_gso(skb)) @@ -49,6 +54,8 @@ static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) if (skb->sw_hash) needed++; + spin_unlock_irqrestore(&queue->rx_queue.lock, flags); + do { prod = queue->rx.sring->req_prod; cons = queue->rx.req_cons; From 3401e4aa43a540881cc97190afead650e709c418 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Tue, 2 Feb 2021 23:55:11 +0530 Subject: [PATCH 19/69] cxgb4: Add new T6 PCI device id 0x6092 Signed-off-by: Raju Rangoju Link: https://lore.kernel.org/r/20210202182511.8109-1-rajur@chelsio.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index 0c5373462ced..0b1b5f9c67d4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -219,6 +219,7 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x6089), /* Custom T62100-KR */ CH_PCI_ID_TABLE_FENTRY(0x608a), /* Custom T62100-CR */ CH_PCI_ID_TABLE_FENTRY(0x608b), /* Custom T6225-CR */ + CH_PCI_ID_TABLE_FENTRY(0x6092), /* Custom T62100-CR-LOM */ CH_PCI_DEVICE_ID_TABLE_DEFINE_END; #endif /* __T4_PCI_ID_TBL_H__ */ From 7b5eab57cac45e270a0ad624ba157c5b30b3d44d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Feb 2021 08:47:56 +0000 Subject: [PATCH 20/69] rxrpc: Fix clearance of Tx/Rx ring when releasing a call At the end of rxrpc_release_call(), rxrpc_cleanup_ring() is called to clear the Rx/Tx skbuff ring, but this doesn't lock the ring whilst it's accessing it. Unfortunately, rxrpc_resend() might be trying to retransmit a packet concurrently with this - and whilst it does lock the ring, this isn't protection against rxrpc_cleanup_call(). Fix this by removing the call to rxrpc_cleanup_ring() from rxrpc_release_call(). rxrpc_cleanup_ring() will be called again anyway from rxrpc_cleanup_call(). The earlier call is just an optimisation to recycle skbuffs more quickly. Alternative solutions include rxrpc_release_call() could try to cancel the work item or wait for it to complete or rxrpc_cleanup_ring() could lock when accessing the ring (which would require a bh lock). This can produce a report like the following: BUG: KASAN: use-after-free in rxrpc_send_data_packet+0x19b4/0x1e70 net/rxrpc/output.c:372 Read of size 4 at addr ffff888011606e04 by task kworker/0:0/5 ... Workqueue: krxrpcd rxrpc_process_call Call Trace: ... kasan_report.cold+0x79/0xd5 mm/kasan/report.c:413 rxrpc_send_data_packet+0x19b4/0x1e70 net/rxrpc/output.c:372 rxrpc_resend net/rxrpc/call_event.c:266 [inline] rxrpc_process_call+0x1634/0x1f60 net/rxrpc/call_event.c:412 process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275 ... Allocated by task 2318: ... sock_alloc_send_pskb+0x793/0x920 net/core/sock.c:2348 rxrpc_send_data+0xb51/0x2bf0 net/rxrpc/sendmsg.c:358 rxrpc_do_sendmsg+0xc03/0x1350 net/rxrpc/sendmsg.c:744 rxrpc_sendmsg+0x420/0x630 net/rxrpc/af_rxrpc.c:560 ... Freed by task 2318: ... kfree_skb+0x140/0x3f0 net/core/skbuff.c:704 rxrpc_free_skb+0x11d/0x150 net/rxrpc/skbuff.c:78 rxrpc_cleanup_ring net/rxrpc/call_object.c:485 [inline] rxrpc_release_call+0x5dd/0x860 net/rxrpc/call_object.c:552 rxrpc_release_calls_on_socket+0x21c/0x300 net/rxrpc/call_object.c:579 rxrpc_release_sock net/rxrpc/af_rxrpc.c:885 [inline] rxrpc_release+0x263/0x5a0 net/rxrpc/af_rxrpc.c:916 __sock_release+0xcd/0x280 net/socket.c:597 ... The buggy address belongs to the object at ffff888011606dc0 which belongs to the cache skbuff_head_cache of size 232 Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Reported-by: syzbot+174de899852504e4a74a@syzkaller.appspotmail.com Reported-by: syzbot+3d1c772efafd3c38d007@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Hillf Danton Link: https://lore.kernel.org/r/161234207610.653119.5287360098400436976.stgit@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/call_object.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c845594b663f..4eb91d958a48 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -548,8 +548,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); - - rxrpc_cleanup_ring(call); _leave(""); } From 81b8be68ef8e8915d0cc6cedd2ac425c74a24813 Mon Sep 17 00:00:00 2001 From: Xie He Date: Tue, 2 Feb 2021 23:15:41 -0800 Subject: [PATCH 21/69] net: hdlc_x25: Return meaningful error code in x25_open It's not meaningful to pass on LAPB error codes to HDLC code or other parts of the system, because they will not understand the error codes. Instead, use system-wide recognizable error codes. Fixes: f362e5fe0f1f ("wan/hdlc_x25: make lapb params configurable") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xie He Acked-by: Martin Schiller Link: https://lore.kernel.org/r/20210203071541.86138-1-xie.he.0141@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/hdlc_x25.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c index bb164805804e..4aaa6388b9ee 100644 --- a/drivers/net/wan/hdlc_x25.c +++ b/drivers/net/wan/hdlc_x25.c @@ -169,11 +169,11 @@ static int x25_open(struct net_device *dev) result = lapb_register(dev, &cb); if (result != LAPB_OK) - return result; + return -ENOMEM; result = lapb_getparms(dev, ¶ms); if (result != LAPB_OK) - return result; + return -EINVAL; if (state(hdlc)->settings.dce) params.mode = params.mode | LAPB_DCE; @@ -188,7 +188,7 @@ static int x25_open(struct net_device *dev) result = lapb_setparms(dev, ¶ms); if (result != LAPB_OK) - return result; + return -EINVAL; return 0; } From 1d23a56b0296d29e7047b41fe0a42a001036160d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Feb 2021 19:06:55 -0600 Subject: [PATCH 22/69] net: ipa: set error code in gsi_channel_setup() In gsi_channel_setup(), we check to see if the configuration data contains any information about channels that are not supported by the hardware. If one is found, we abort the setup process, but the error code (ret) is not set in this case. Fix this bug. Fixes: 650d1603825d8 ("soc: qcom: ipa: the generic software interface") Reported-by: Dan Carpenter Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20210204010655.15619-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 34e5f2155d62..b77f5fef7aec 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -1710,6 +1710,7 @@ static int gsi_channel_setup(struct gsi *gsi) if (!channel->gsi) continue; /* Ignore uninitialized channels */ + ret = -EINVAL; dev_err(gsi->dev, "channel %u not supported by hardware\n", channel_id - 1); channel_id = gsi->channel_count; From 52cbd23a119c6ebf40a527e53f3402d2ea38eccb Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 3 Feb 2021 14:29:52 -0500 Subject: [PATCH 23/69] udp: fix skb_copy_and_csum_datagram with odd segment sizes When iteratively computing a checksum with csum_block_add, track the offset "pos" to correctly rotate in csum_block_add when offset is odd. The open coded implementation of skb_copy_and_csum_datagram did this. With the switch to __skb_datagram_iter calling csum_and_copy_to_iter, pos was reinitialized to 0 on each call. Bring back the pos by passing it along with the csum to the callback. Changes v1->v2 - pass csum value, instead of csump pointer (Alexander Duyck) Link: https://lore.kernel.org/netdev/20210128152353.GB27281@optiplex/ Fixes: 950fcaecd5cc ("datagram: consolidate datagram copy to iter helpers") Reported-by: Oliver Graute Signed-off-by: Willem de Bruijn Reviewed-by: Alexander Duyck Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210203192952.1849843-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/uio.h | 8 +++++++- lib/iov_iter.c | 24 ++++++++++++++---------- net/core/datagram.c | 12 ++++++++++-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 72d88566694e..27ff8eb786dc 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -260,7 +260,13 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } -size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i); + +struct csum_state { + __wsum csum; + size_t off; +}; + +size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index a21e6a5792c5..f0b2ccb1bb01 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -592,14 +592,15 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len, } static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, - __wsum *csum, struct iov_iter *i) + struct csum_state *csstate, + struct iov_iter *i) { struct pipe_inode_info *pipe = i->pipe; unsigned int p_mask = pipe->ring_size - 1; + __wsum sum = csstate->csum; + size_t off = csstate->off; unsigned int i_head; size_t n, r; - size_t off = 0; - __wsum sum = *csum; if (!sanity(i)) return 0; @@ -621,7 +622,8 @@ static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, i_head++; } while (n); i->count -= bytes; - *csum = sum; + csstate->csum = sum; + csstate->off = off; return bytes; } @@ -1522,18 +1524,19 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_from_iter_full); -size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, +size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, struct iov_iter *i) { + struct csum_state *csstate = _csstate; const char *from = addr; - __wsum *csum = csump; __wsum sum, next; - size_t off = 0; + size_t off; if (unlikely(iov_iter_is_pipe(i))) - return csum_and_copy_to_pipe_iter(addr, bytes, csum, i); + return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i); - sum = *csum; + sum = csstate->csum; + off = csstate->off; if (unlikely(iov_iter_is_discard(i))) { WARN_ON(1); /* for now */ return 0; @@ -1561,7 +1564,8 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, off += v.iov_len; }) ) - *csum = sum; + csstate->csum = sum; + csstate->off = off; return bytes; } EXPORT_SYMBOL(csum_and_copy_to_iter); diff --git a/net/core/datagram.c b/net/core/datagram.c index 81809fa735a7..15ab9ffb27fe 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -721,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - return __skb_datagram_iter(skb, offset, to, len, true, - csum_and_copy_to_iter, csump); + struct csum_state csdata = { .csum = *csump }; + int ret; + + ret = __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, &csdata); + if (ret) + return ret; + + *csump = csdata.csum; + return 0; } /** From 12bc8dfb83b5292fe387b795210018b7632ee08b Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 3 Feb 2021 12:36:02 +0100 Subject: [PATCH 24/69] hv_netvsc: Reset the RSC count if NVSP_STAT_FAIL in netvsc_receive() Commit 44144185951a0f ("hv_netvsc: Add validation for untrusted Hyper-V values") added validation to rndis_filter_receive_data() (and rndis_filter_receive()) which introduced NVSP_STAT_FAIL-scenarios where the count is not updated/reset. Fix this omission, and prevent similar scenarios from occurring in the future. Reported-by: Juan Vazquez Signed-off-by: Andrea Parri (Microsoft) Fixes: 44144185951a0f ("hv_netvsc: Add validation for untrusted Hyper-V values") Reviewed-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20210203113602.558916-1-parri.andrea@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/netvsc.c | 5 ++++- drivers/net/hyperv/rndis_filter.c | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 2350342b961f..13bd48a75db7 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1262,8 +1262,11 @@ static int netvsc_receive(struct net_device *ndev, ret = rndis_filter_receive(ndev, net_device, nvchan, data, buflen); - if (unlikely(ret != NVSP_STAT_SUCCESS)) + if (unlikely(ret != NVSP_STAT_SUCCESS)) { + /* Drop incomplete packet */ + nvchan->rsc.cnt = 0; status = NVSP_STAT_FAIL; + } } enq_receive_complete(ndev, net_device, q_idx, diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 598713c0d5a8..3aab2b867fc0 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -509,8 +509,6 @@ static int rndis_filter_receive_data(struct net_device *ndev, return ret; drop: - /* Drop incomplete packet */ - nvchan->rsc.cnt = 0; return NVSP_STAT_FAIL; } From 07bf34a50e327975b21a9dee64d220c3dcb72ee9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Feb 2021 15:45:11 +0200 Subject: [PATCH 25/69] net: enetc: initialize the RFS and RSS memories Michael tried to enable Advanced Error Reporting through the ENETC's Root Complex Event Collector, and the system started spitting out single bit correctable ECC errors coming from the ENETC interfaces: pcieport 0000:00:1f.0: AER: Multiple Corrected error received: 0000:00:00.0 fsl_enetc 0000:00:00.0: PCIe Bus Error: severity=Corrected, type=Transaction Layer, (Receiver ID) fsl_enetc 0000:00:00.0: device [1957:e100] error status/mask=00004000/00000000 fsl_enetc 0000:00:00.0: [14] CorrIntErr fsl_enetc 0000:00:00.1: PCIe Bus Error: severity=Corrected, type=Transaction Layer, (Receiver ID) fsl_enetc 0000:00:00.1: device [1957:e100] error status/mask=00004000/00000000 fsl_enetc 0000:00:00.1: [14] CorrIntErr Further investigating the port correctable memory error detect register (PCMEDR) shows that these AER errors have an associated SOURCE_ID of 6 (RFS/RSS): $ devmem 0x1f8010e10 32 0xC0000006 $ devmem 0x1f8050e10 32 0xC0000006 Discussion with the hardware design engineers reveals that on LS1028A, the hardware does not do initialization of that RFS/RSS memory, and that software should clear/initialize the entire table before starting to operate. That comes as a bit of a surprise, since the driver does not do initialization of the RFS memory. Also, the initialization of the Receive Side Scaling is done only partially. Even though the entire ENETC IP has a single shared flow steering memory, the flow steering service should returns matches only for TCAM entries that are within the range of the Station Interface that is doing the search. Therefore, it should be sufficient for a Station Interface to initialize all of its own entries in order to avoid any ECC errors, and only the Station Interfaces in use should need initialization. There are Physical Station Interfaces associated with PCIe PFs and Virtual Station Interfaces associated with PCIe VFs. We let the PF driver initialize the entire port's memory, which includes the RFS entries which are going to be used by the VF. Reported-by: Michael Walle Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Reviewed-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20210204134511.2640309-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/enetc/enetc_hw.h | 2 + .../net/ethernet/freescale/enetc/enetc_pf.c | 59 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index e1e950d48c92..c71fe8d751d5 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -196,6 +196,8 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_CBS_BW_MASK GENMASK(6, 0) #define ENETC_PTCCBSR1(n) (0x1114 + (n) * 8) /* n = 0 to 7*/ #define ENETC_RSSHASH_KEY_SIZE 40 +#define ENETC_PRSSCAPR 0x1404 +#define ENETC_PRSSCAPR_GET_NUM_RSS(val) (BIT((val) & 0xf) * 32) #define ENETC_PRSSK(n) (0x1410 + (n) * 4) /* n = [0..9] */ #define ENETC_PSIVLANFMR 0x1700 #define ENETC_PSIVLANFMR_VS BIT(0) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index ed8fcb8b486e..3eb5f1375bd4 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -996,6 +996,51 @@ static void enetc_phylink_destroy(struct enetc_ndev_priv *priv) phylink_destroy(priv->phylink); } +/* Initialize the entire shared memory for the flow steering entries + * of this port (PF + VFs) + */ +static int enetc_init_port_rfs_memory(struct enetc_si *si) +{ + struct enetc_cmd_rfse rfse = {0}; + struct enetc_hw *hw = &si->hw; + int num_rfs, i, err = 0; + u32 val; + + val = enetc_port_rd(hw, ENETC_PRFSCAPR); + num_rfs = ENETC_PRFSCAPR_GET_NUM_RFS(val); + + for (i = 0; i < num_rfs; i++) { + err = enetc_set_fs_entry(si, &rfse, i); + if (err) + break; + } + + return err; +} + +static int enetc_init_port_rss_memory(struct enetc_si *si) +{ + struct enetc_hw *hw = &si->hw; + int num_rss, err; + int *rss_table; + u32 val; + + val = enetc_port_rd(hw, ENETC_PRSSCAPR); + num_rss = ENETC_PRSSCAPR_GET_NUM_RSS(val); + if (!num_rss) + return 0; + + rss_table = kcalloc(num_rss, sizeof(*rss_table), GFP_KERNEL); + if (!rss_table) + return -ENOMEM; + + err = enetc_set_rss_table(si, rss_table, num_rss); + + kfree(rss_table); + + return err; +} + static int enetc_pf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -1051,6 +1096,18 @@ static int enetc_pf_probe(struct pci_dev *pdev, goto err_alloc_si_res; } + err = enetc_init_port_rfs_memory(si); + if (err) { + dev_err(&pdev->dev, "Failed to initialize RFS memory\n"); + goto err_init_port_rfs; + } + + err = enetc_init_port_rss_memory(si); + if (err) { + dev_err(&pdev->dev, "Failed to initialize RSS memory\n"); + goto err_init_port_rss; + } + err = enetc_alloc_msix(priv); if (err) { dev_err(&pdev->dev, "MSIX alloc failed\n"); @@ -1079,6 +1136,8 @@ static int enetc_pf_probe(struct pci_dev *pdev, enetc_mdiobus_destroy(pf); err_mdiobus_create: enetc_free_msix(priv); +err_init_port_rss: +err_init_port_rfs: err_alloc_msix: enetc_free_si_resources(priv); err_alloc_si_res: From 8fd54a73b7cda11548154451bdb4bde6d8ff74c7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Feb 2021 18:33:51 +0200 Subject: [PATCH 26/69] net: dsa: call teardown method on probe failure Since teardown is supposed to undo the effects of the setup method, it should be called in the error path for dsa_switch_setup, not just in dsa_switch_teardown. Fixes: 5e3f847a02aa ("net: dsa: Add teardown callback for drivers") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20210204163351.2929670-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index a47e0f9b20d0..a04fd637b4cd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -462,20 +462,23 @@ static int dsa_switch_setup(struct dsa_switch *ds) ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) { err = -ENOMEM; - goto unregister_notifier; + goto teardown; } dsa_slave_mii_bus_init(ds); err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - goto unregister_notifier; + goto teardown; } ds->setup = true; return 0; +teardown: + if (ds->ops->teardown) + ds->ops->teardown(ds); unregister_notifier: dsa_switch_unregister_notifier(ds); unregister_devlink_ports: From 647b8dd5184665432cc8a2b5bca46a201f690c37 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 4 Feb 2021 20:50:34 +0300 Subject: [PATCH 27/69] selftests: txtimestamp: fix compilation issue PACKET_TX_TIMESTAMP is defined in if_packet.h but it is not included in test. Include it instead of otherwise the error of redefinition arrives. Also fix the compiler warning about ambiguous control flow by adding explicit braces. Fixes: 8fe2f761cae9 ("net-timestamp: expand documentation") Suggested-by: Willem de Bruijn Signed-off-by: Vadim Fedorenko Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/1612461034-24524-1-git-send-email-vfedorenko@novek.ru Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/txtimestamp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index 490a8cca708a..fabb1d555ee5 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -495,12 +495,12 @@ static void do_test(int family, unsigned int report_opt) total_len = cfg_payload_len; if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) { total_len += sizeof(struct udphdr); - if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) + if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) { if (family == PF_INET) total_len += sizeof(struct iphdr); else total_len += sizeof(struct ipv6hdr); - + } /* special case, only rawv6_sendmsg: * pass proto in sin6_port if not connected * also see ANK comment in net/ipv4/raw.c From 8dc1c444df193701910f5e80b5d4caaf705a8fb0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Feb 2021 13:31:46 -0800 Subject: [PATCH 28/69] net: gro: do not keep too many GRO packets in napi->rx_list Commit c80794323e82 ("net: Fix packet reordering caused by GRO and listified RX cooperation") had the unfortunate effect of adding latencies in common workloads. Before the patch, GRO packets were immediately passed to upper stacks. After the patch, we can accumulate quite a lot of GRO packets (depdending on NAPI budget). My fix is counting in napi->rx_count number of segments instead of number of logical packets. Fixes: c80794323e82 ("net: Fix packet reordering caused by GRO and listified RX cooperation") Signed-off-by: Eric Dumazet Bisected-by: John Sperbeck Tested-by: Jian Yang Cc: Maxim Mikityanskiy Reviewed-by: Saeed Mahameed Reviewed-by: Edward Cree Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/r/20210204213146.4192368-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index a979b86dbacd..449b45b843d4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5735,10 +5735,11 @@ static void gro_normal_list(struct napi_struct *napi) /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) + napi->rx_count += segs; + if (napi->rx_count >= gro_normal_batch) gro_normal_list(napi); } @@ -5777,7 +5778,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); return NET_RX_SUCCESS; } @@ -6067,7 +6068,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, 1); break; case GRO_DROP: @@ -6155,7 +6156,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, 1); break; case GRO_DROP: From 275a9c72b420e5051b0e92e49b26bef06c196f29 Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 4 Feb 2021 18:49:26 +0200 Subject: [PATCH 29/69] dpaa_eth: reserve space for the xdp_frame under the A050385 erratum When the erratum workaround is triggered, the newly created xdp_frame structure is stored at the start of the newly allocated buffer. Avoid the structure from being overwritten by explicitly reserving enough space in the buffer for storing it. Account for the fact that the structure's size might increase in time by aligning the headroom to DPAA_FD_DATA_ALIGNMENT bytes, thus guaranteeing the data's alignment. Fixes: ae680bcbd06a ("dpaa_eth: implement the A050385 erratum workaround for XDP") Signed-off-by: Camelia Groza Acked-by: Maciej Fijalkowski Acked-by: Madalin Bucur Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/dpaa/dpaa_eth.c | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 4360ce4d3fb6..f3a879937d8d 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2182,6 +2182,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, struct xdp_frame *new_xdpf, *xdpf = *init_xdpf; void *new_buff; struct page *p; + int headroom; /* Check the data alignment and make sure the headroom is large * enough to store the xdpf backpointer. Use an aligned headroom @@ -2197,19 +2198,34 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, return 0; } + /* The new xdp_frame is stored in the new buffer. Reserve enough space + * in the headroom for storing it along with the driver's private + * info. The headroom needs to be aligned to DPAA_FD_DATA_ALIGNMENT to + * guarantee the data's alignment in the buffer. + */ + headroom = ALIGN(sizeof(*new_xdpf) + priv->tx_headroom, + DPAA_FD_DATA_ALIGNMENT); + + /* Assure the extended headroom and data don't overflow the buffer, + * while maintaining the mandatory tailroom. + */ + if (headroom + xdpf->len > DPAA_BP_RAW_SIZE - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + return -ENOMEM; + p = dev_alloc_pages(0); if (unlikely(!p)) return -ENOMEM; /* Copy the data to the new buffer at a properly aligned offset */ new_buff = page_address(p); - memcpy(new_buff + priv->tx_headroom, xdpf->data, xdpf->len); + memcpy(new_buff + headroom, xdpf->data, xdpf->len); /* Create an XDP frame around the new buffer in a similar fashion * to xdp_convert_buff_to_frame. */ new_xdpf = new_buff; - new_xdpf->data = new_buff + priv->tx_headroom; + new_xdpf->data = new_buff + headroom; new_xdpf->len = xdpf->len; new_xdpf->headroom = priv->tx_headroom; new_xdpf->frame_sz = DPAA_BP_RAW_SIZE; From c2b0e8455eb76135f505dda81a8869e60f37a861 Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 4 Feb 2021 18:49:27 +0200 Subject: [PATCH 30/69] dpaa_eth: reduce data alignment requirements for the A050385 erratum The 256 byte data alignment is required for preventing DMA transaction splits when crossing 4K page boundaries. Since XDP deals only with page sized buffers or less, this restriction isn't needed. Instead, the data only needs to be aligned to 64 bytes to prevent DMA transaction splits. These lessened restrictions can increase performance by widening the pool of permitted data alignments and preventing unnecessary realignments. Fixes: ae680bcbd06a ("dpaa_eth: implement the A050385 erratum workaround for XDP") Signed-off-by: Camelia Groza Acked-by: Maciej Fijalkowski Acked-by: Madalin Bucur Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index f3a879937d8d..2a2c7db23407 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2192,7 +2192,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, * byte frame headroom. If the XDP program uses all of it, copy the * data to a new buffer and make room for storing the backpointer. */ - if (PTR_IS_ALIGNED(xdpf->data, DPAA_A050385_ALIGN) && + if (PTR_IS_ALIGNED(xdpf->data, DPAA_FD_DATA_ALIGNMENT) && xdpf->headroom >= priv->tx_headroom) { xdpf->headroom = priv->tx_headroom; return 0; From 0a9946cca1a30b7236a86757da9df2222eb73ee0 Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 4 Feb 2021 18:49:28 +0200 Subject: [PATCH 31/69] dpaa_eth: try to move the data in place for the A050385 erratum The XDP frame's headroom might be large enough to accommodate the xdpf backpointer as well as shifting the data to an aligned address. Try this first before resorting to allocating a new buffer and copying the data. Suggested-by: Maciej Fijalkowski Signed-off-by: Camelia Groza Acked-by: Maciej Fijalkowski Acked-by: Madalin Bucur Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/dpaa/dpaa_eth.c | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 2a2c7db23407..6faa20bed488 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2180,8 +2180,9 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, struct xdp_frame **init_xdpf) { struct xdp_frame *new_xdpf, *xdpf = *init_xdpf; - void *new_buff; + void *new_buff, *aligned_data; struct page *p; + u32 data_shift; int headroom; /* Check the data alignment and make sure the headroom is large @@ -2198,6 +2199,23 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, return 0; } + /* Try to move the data inside the buffer just enough to align it and + * store the xdpf backpointer. If the available headroom isn't large + * enough, resort to allocating a new buffer and copying the data. + */ + aligned_data = PTR_ALIGN_DOWN(xdpf->data, DPAA_FD_DATA_ALIGNMENT); + data_shift = xdpf->data - aligned_data; + + /* The XDP frame's headroom needs to be large enough to accommodate + * shifting the data as well as storing the xdpf backpointer. + */ + if (xdpf->headroom >= data_shift + priv->tx_headroom) { + memmove(aligned_data, xdpf->data, xdpf->len); + xdpf->data = aligned_data; + xdpf->headroom = priv->tx_headroom; + return 0; + } + /* The new xdp_frame is stored in the new buffer. Reserve enough space * in the headroom for storing it along with the driver's private * info. The headroom needs to be aligned to DPAA_FD_DATA_ALIGNMENT to From f317e2ea8c88737aa36228167b2292baef3f0430 Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Thu, 4 Feb 2021 22:03:16 +0800 Subject: [PATCH 32/69] net: stmmac: set TxQ mode back to DCB after disabling CBS When disable CBS, mode_to_use parameter is not updated even the operation mode of Tx Queue is changed to Data Centre Bridging (DCB). Therefore, when tc_setup_cbs() function is called to re-enable CBS, the operation mode of Tx Queue remains at DCB, which causing CBS fails to work. This patch updates the value of mode_to_use parameter to MTL_QUEUE_DCB after operation mode of Tx Queue is changed to DCB in stmmac_dma_qmode() callback function. Fixes: 1f705bc61aee ("net: stmmac: Add support for CBS QDISC") Suggested-by: Vinicius Costa Gomes Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: Song, Yoong Siang Reviewed-by: Jesse Brandeburg Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/1612447396-20351-1-git-send-email-yoong.siang.song@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 8ed3b2c834a0..56985542e202 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -324,7 +324,12 @@ static int tc_setup_cbs(struct stmmac_priv *priv, priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_AVB; } else if (!qopt->enable) { - return stmmac_dma_qmode(priv, priv->ioaddr, queue, MTL_QUEUE_DCB); + ret = stmmac_dma_qmode(priv, priv->ioaddr, queue, + MTL_QUEUE_DCB); + if (ret) + return ret; + + priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; } /* Port Transmit Rate and Speed Divider */ From ef66a1eace968ff22a35f45e6e8ec36b668b6116 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 2 Feb 2021 21:08:02 -0800 Subject: [PATCH 33/69] ibmvnic: Clear failover_pending if unable to schedule Normally we clear the failover_pending flag when processing the reset. But if we are unable to schedule a failover reset we must clear the flag ourselves. We could fail to schedule the reset if we are in PROBING state (eg: when booting via kexec) or because we could not allocate memory. Thanks to Cris Forno for helping isolate the problem and for testing. Fixes: 1d8504937478 ("powerpc/vnic: Extend "failover pending" window") Signed-off-by: Sukadev Bhattiprolu Tested-by: Cristobal Forno Link: https://lore.kernel.org/r/20210203050802.680772-1-sukadev@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index f79034c786c8..a536fdbf05e1 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4918,7 +4918,22 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, complete(&adapter->init_done); adapter->init_done_rc = -EIO; } - ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + rc = ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + if (rc && rc != -EBUSY) { + /* We were unable to schedule the failover + * reset either because the adapter was still + * probing (eg: during kexec) or we could not + * allocate memory. Clear the failover_pending + * flag since no one else will. We ignore + * EBUSY because it means either FAILOVER reset + * is already scheduled or the adapter is + * being removed. + */ + netdev_err(netdev, + "Error %ld scheduling failover reset\n", + rc); + adapter->failover_pending = false; + } break; case IBMVNIC_CRQ_INIT_COMPLETE: dev_info(dev, "Partner initialization complete\n"); From 5d1cbcc990f18edaddddef26677073c4e6fad7b7 Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Fri, 5 Feb 2021 13:12:06 +0100 Subject: [PATCH 34/69] net/vmw_vsock: fix NULL pointer dereference In vsock_stream_connect(), a thread will enter schedule_timeout(). While being scheduled out, another thread can enter vsock_stream_connect() as well and set vsk->transport to NULL. In case a signal was sent, the first thread can leave schedule_timeout() and vsock_transport_cancel_pkt() will be called right after. Inside vsock_transport_cancel_pkt(), a null dereference will happen on transport->cancel_pkt. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Signed-off-by: Norbert Slusarek Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/trinity-c2d6cede-bfb1-44e2-85af-1fbc7f541715-1612535117028@3c-app-gmx-bap12 Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 6894f21dc147..cb81cfb47a78 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1233,7 +1233,7 @@ static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; - if (!transport->cancel_pkt) + if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); From 3d0bc44d39bca615b72637e340317b7899b7f911 Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Fri, 5 Feb 2021 13:14:05 +0100 Subject: [PATCH 35/69] net/vmw_vsock: improve locking in vsock_connect_timeout() A possible locking issue in vsock_connect_timeout() was recognized by Eric Dumazet which might cause a null pointer dereference in vsock_transport_cancel_pkt(). This patch assures that vsock_transport_cancel_pkt() will be called within the lock, so a race condition won't occur which could result in vsk->transport to be set to NULL. Fixes: 380feae0def7 ("vsock: cancel packets when failing to connect") Reported-by: Eric Dumazet Signed-off-by: Norbert Slusarek Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/trinity-f8e0937a-cf0e-4d80-a76e-d9a958ba3ef1-1612535522360@3c-app-gmx-bap12 Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index cb81cfb47a78..4ea301fc2bf0 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1243,7 +1243,6 @@ static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; - int cancel = 0; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); @@ -1254,11 +1253,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); - cancel = 1; + vsock_transport_cancel_pkt(vsk); } release_sock(sk); - if (cancel) - vsock_transport_cancel_pkt(vsk); sock_put(sk); } From 225353c070fda18a23785e34e1eec2be508a3a3c Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Fri, 5 Feb 2021 21:51:14 +0200 Subject: [PATCH 36/69] net: ena: Update XDP verdict upon failure The verdict returned from ena_xdp_execute() is used to determine the fate of the RX buffer's page. In case of XDP Redirect/TX error the verdict should be set to XDP_ABORTED, otherwise the page won't be freed. Fixes: a318c70ad152 ("net: ena: introduce XDP redirect implementation") Signed-off-by: Shay Agroskin Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 06596fa1f9fe..a0596c073ddd 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -404,6 +404,7 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) if (unlikely(!xdpf)) { trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = XDP_ABORTED; break; } @@ -424,7 +425,10 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) xdp_stat = &rx_ring->rx_stats.xdp_redirect; break; } - fallthrough; + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = XDP_ABORTED; + break; case XDP_ABORTED: trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_aborted; From b6c14d7a83802046f7098e9bae78fbde23affa74 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Wed, 3 Feb 2021 20:19:24 +0100 Subject: [PATCH 37/69] dmaengine dw: Revert "dmaengine: dw: Enable runtime PM" This reverts commit 842067940a3e3fc008a60fee388e000219b32632. For some solutions e.g. sound/soc/intel/catpt, DW DMA is part of a compound device (in that very example, domains: ADSP, SSP0, SSP1, DMA0 and DMA1 are part of a single entity) rather than being a standalone one. Driver for said device may enlist DMA to transfer data during suspend or resume sequences. Manipulating RPM explicitly in dw's DMA request and release channel functions causes suspend() to also invoke resume() for the exact same device. Similar situation occurs for resume() sequence. Effectively renders device dysfunctional after first suspend() attempt. Revert the change to address the problem. Fixes: 842067940a3e ("dmaengine: dw: Enable runtime PM") Cc: Andy Shevchenko Signed-off-by: Cezary Rojewski Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210203191924.15706-1-cezary.rojewski@intel.com Signed-off-by: Vinod Koul --- drivers/dma/dw/core.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 19a23767533a..7ab83fe601ed 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -982,11 +982,8 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) dev_vdbg(chan2dev(chan), "%s\n", __func__); - pm_runtime_get_sync(dw->dma.dev); - /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { - pm_runtime_put_sync_suspend(dw->dma.dev); dev_dbg(chan2dev(chan), "DMA channel not idle?\n"); return -EIO; } @@ -1003,7 +1000,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) * We need controller-specific data to set up slave transfers. */ if (chan->private && !dw_dma_filter(chan, chan->private)) { - pm_runtime_put_sync_suspend(dw->dma.dev); dev_warn(chan2dev(chan), "Wrong controller-specific data\n"); return -EINVAL; } @@ -1047,8 +1043,6 @@ static void dwc_free_chan_resources(struct dma_chan *chan) if (!dw->in_use) do_dw_dma_off(dw); - pm_runtime_put_sync_suspend(dw->dma.dev); - dev_vdbg(chan2dev(chan), "%s: done\n", __func__); } From 3c55e94c0adea4a5389c4b80f6ae9927dd6a4501 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Feb 2021 18:25:37 +0100 Subject: [PATCH 38/69] cpufreq: ACPI: Extend frequency tables to cover boost frequencies A severe performance regression on AMD EPYC processors when using the schedutil scaling governor was discovered by Phoronix.com and attributed to the following commits: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") The source of the problem is that the maximum performance level taken for computing the arch_max_freq_ratio value used in the x86 scale- invariance code is higher than the one corresponding to the cpuinfo.max_freq value coming from the acpi_cpufreq driver. This effectively causes the scale-invariant utilization to fall below 100% even if the CPU runs at cpuinfo.max_freq or slightly faster, so the schedutil governor selects a frequency below cpuinfo.max_freq then. That frequency corresponds to a frequency table entry below the maximum performance level necessary to get to the "boost" range of CPU frequencies. However, if the cpuinfo.max_freq value coming from acpi_cpufreq was higher, the schedutil governor would select higher frequencies which in turn would allow acpi_cpufreq to set more adequate performance levels and to get to the "boost" range of CPU frequencies more often. This issue affects any systems where acpi_cpufreq is used and the "boost" (or "turbo") frequencies are enabled, not just AMD EPYC. Moreover, commit db865272d9c4 ("cpufreq: Avoid configuring old governors as default with intel_pstate") from the 5.10 development cycle made it extremely easy to default to schedutil even if the preferred driver is acpi_cpufreq as long as intel_pstate is built too, because the mere presence of the latter effectively removes the ondemand governor from the defaults. Distro kernels are likely to include both intel_pstate and acpi_cpufreq on x86, so their users who cannot use intel_pstate or choose to use acpi_cpufreq may easily be affectecd by this issue. To address this issue, extend the frequency table constructed by acpi_cpufreq for each CPU to cover the entire range of available frequencies (including the "boost" ones) if CPPC is available and indicates that "boost" (or "turbo") frequencies are enabled. That causes cpuinfo.max_freq to become the maximum "boost" frequency of the given CPU (instead of the maximum frequency returned by the ACPI _PSS object that corresponds to the "nominal" performance level). Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") Fixes: db865272d9c4 ("cpufreq: Avoid configuring old governors as default with intel_pstate") Link: https://www.phoronix.com/scan.php?page=article&item=linux511-amd-schedutil&num=1 Link: https://lore.kernel.org/linux-pm/20210203135321.12253-2-ggherdovich@suse.cz/ Reported-by: Michael Larabel Diagnosed-by: Giovanni Gherdovich Signed-off-by: Rafael J. Wysocki Tested-by: Giovanni Gherdovich Reviewed-by: Giovanni Gherdovich Tested-by: Michael Larabel --- drivers/cpufreq/acpi-cpufreq.c | 109 +++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 1e4fbb002a31..4614f1c6f50a 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ struct acpi_cpufreq_data { unsigned int resume; unsigned int cpu_feature; unsigned int acpi_perf_cpu; + unsigned int first_perf_state; cpumask_var_t freqdomain_cpus; void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val); u32 (*cpu_freq_read)(struct acpi_pct_register *reg); @@ -221,10 +223,10 @@ static unsigned extract_msr(struct cpufreq_policy *policy, u32 msr) perf = to_perf_data(data); - cpufreq_for_each_entry(pos, policy->freq_table) + cpufreq_for_each_entry(pos, policy->freq_table + data->first_perf_state) if (msr == perf->states[pos->driver_data].status) return pos->frequency; - return policy->freq_table[0].frequency; + return policy->freq_table[data->first_perf_state].frequency; } static unsigned extract_freq(struct cpufreq_policy *policy, u32 val) @@ -363,6 +365,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) struct cpufreq_policy *policy; unsigned int freq; unsigned int cached_freq; + unsigned int state; pr_debug("%s (%d)\n", __func__, cpu); @@ -374,7 +377,11 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) if (unlikely(!data || !policy->freq_table)) return 0; - cached_freq = policy->freq_table[to_perf_data(data)->state].frequency; + state = to_perf_data(data)->state; + if (state < data->first_perf_state) + state = data->first_perf_state; + + cached_freq = policy->freq_table[state].frequency; freq = extract_freq(policy, get_cur_val(cpumask_of(cpu), data)); if (freq != cached_freq) { /* @@ -628,16 +635,54 @@ static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) } #endif +#ifdef CONFIG_ACPI_CPPC_LIB +static u64 get_max_boost_ratio(unsigned int cpu) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf; + int ret; + + if (acpi_pstate_strict) + return 0; + + ret = cppc_get_perf_caps(cpu, &perf_caps); + if (ret) { + pr_debug("CPU%d: Unable to get performance capabilities (%d)\n", + cpu, ret); + return 0; + } + + highest_perf = perf_caps.highest_perf; + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("CPU%d: highest or nominal performance missing\n", cpu); + return 0; + } + + if (highest_perf < nominal_perf) { + pr_debug("CPU%d: nominal performance above highest\n", cpu); + return 0; + } + + return div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); +} +#else +static inline u64 get_max_boost_ratio(unsigned int cpu) { return 0; } +#endif + static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) { - unsigned int i; - unsigned int valid_states = 0; - unsigned int cpu = policy->cpu; - struct acpi_cpufreq_data *data; - unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - struct acpi_processor_performance *perf; struct cpufreq_frequency_table *freq_table; + struct acpi_processor_performance *perf; + struct acpi_cpufreq_data *data; + unsigned int cpu = policy->cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int valid_states = 0; + unsigned int result = 0; + unsigned int state_count; + u64 max_boost_ratio; + unsigned int i; #ifdef CONFIG_SMP static int blacklisted; #endif @@ -750,8 +795,20 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) goto err_unreg; } - freq_table = kcalloc(perf->state_count + 1, sizeof(*freq_table), - GFP_KERNEL); + state_count = perf->state_count + 1; + + max_boost_ratio = get_max_boost_ratio(cpu); + if (max_boost_ratio) { + /* + * Make a room for one more entry to represent the highest + * available "boost" frequency. + */ + state_count++; + valid_states++; + data->first_perf_state = valid_states; + } + + freq_table = kcalloc(state_count, sizeof(*freq_table), GFP_KERNEL); if (!freq_table) { result = -ENOMEM; goto err_unreg; @@ -785,6 +842,30 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) valid_states++; } freq_table[valid_states].frequency = CPUFREQ_TABLE_END; + + if (max_boost_ratio) { + unsigned int state = data->first_perf_state; + unsigned int freq = freq_table[state].frequency; + + /* + * Because the loop above sorts the freq_table entries in the + * descending order, freq is the maximum frequency in the table. + * Assume that it corresponds to the CPPC nominal frequency and + * use it to populate the frequency field of the extra "boost" + * frequency entry. + */ + freq_table[0].frequency = freq * max_boost_ratio >> SCHED_CAPACITY_SHIFT; + /* + * The purpose of the extra "boost" frequency entry is to make + * the rest of cpufreq aware of the real maximum frequency, but + * the way to request it is the same as for the first_perf_state + * entry that is expected to cover the entire range of "boost" + * frequencies of the CPU, so copy the driver_data value from + * that entry. + */ + freq_table[0].driver_data = freq_table[state].driver_data; + } + policy->freq_table = freq_table; perf->state = 0; @@ -858,8 +939,10 @@ static void acpi_cpufreq_cpu_ready(struct cpufreq_policy *policy) { struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data, policy->cpu); + struct acpi_cpufreq_data *data = policy->driver_data; + unsigned int freq = policy->freq_table[data->first_perf_state].frequency; - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) + if (perf->states[0].core_frequency * 1000 != freq) pr_warn(FW_WARN "P-state 0 is not max freq\n"); } From d11a1d08a082a7dc0ada423d2b2e26e9b6f2525c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Feb 2021 18:34:32 +0100 Subject: [PATCH 39/69] cpufreq: ACPI: Update arch scale-invariance max perf ratio if CPPC is not there If the maximum performance level taken for computing the arch_max_freq_ratio value used in the x86 scale-invariance code is higher than the one corresponding to the cpuinfo.max_freq value coming from the acpi_cpufreq driver, the scale-invariant utilization falls below 100% even if the CPU runs at cpuinfo.max_freq or slightly faster, which causes the schedutil governor to select a frequency below cpuinfo.max_freq. That frequency corresponds to a frequency table entry below the maximum performance level necessary to get to the "boost" range of CPU frequencies which prevents "boost" frequencies from being used in some workloads. While this issue is related to scale-invariance, it may be amplified by commit db865272d9c4 ("cpufreq: Avoid configuring old governors as default with intel_pstate") from the 5.10 development cycle which made it extremely easy to default to schedutil even if the preferred driver is acpi_cpufreq as long as intel_pstate is built too, because the mere presence of the latter effectively removes the ondemand governor from the defaults. Distro kernels are likely to include both intel_pstate and acpi_cpufreq on x86, so their users who cannot use intel_pstate or choose to use acpi_cpufreq may easily be affectecd by this issue. If CPPC is available, it can be used to address this issue by extending the frequency tables created by acpi_cpufreq to cover the entire available frequency range (including "boost" frequencies) for each CPU, but if CPPC is not there, acpi_cpufreq has no idea what the maximum "boost" frequency is and the frequency tables created by it cannot be extended in a meaningful way, so in that case make it ask the arch scale-invariance code to to use the "nominal" performance level for CPU utilization scaling in order to avoid the issue at hand. Fixes: db865272d9c4 ("cpufreq: Avoid configuring old governors as default with intel_pstate") Signed-off-by: Rafael J. Wysocki Reviewed-by: Giovanni Gherdovich Acked-by: Peter Zijlstra (Intel) --- arch/x86/kernel/smpboot.c | 1 + drivers/cpufreq/acpi-cpufreq.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 117e24fbfd8a..02813a7f3a7c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1833,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled) arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool turbo_disabled(void) { diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 4614f1c6f50a..d3e5a6fceb61 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -806,6 +806,14 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) state_count++; valid_states++; data->first_perf_state = valid_states; + } else { + /* + * If the maximum "boost" frequency is unknown, ask the arch + * scale-invariance code to use the "nominal" performance for + * CPU utilization scaling so as to prevent the schedutil + * governor from selecting inadequate CPU frequencies. + */ + arch_set_max_freq_ratio(true); } freq_table = kcalloc(state_count, sizeof(*freq_table), GFP_KERNEL); From fe0af09074bfeb46a35357e67635eefe33cdfc49 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 6 Feb 2021 09:49:37 +0100 Subject: [PATCH 40/69] Revert "ACPICA: Interpreter: fix memory leak by using existing buffer" This reverts commit 32cf1a12cad43358e47dac8014379c2f33dfbed4. The 'exisitng buffer' in this case is the firmware provided table, and we should not modify that in place. This fixes a crash on arm64 with initrd table overrides, in which case the DSDT is not mapped with read/write permissions. Reported-by: Shawn Guo Signed-off-by: Ard Biesheuvel Tested-by: Shawn Guo Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/nsrepair2.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c index d2c8d8279e7a..24c197d91f29 100644 --- a/drivers/acpi/acpica/nsrepair2.c +++ b/drivers/acpi/acpica/nsrepair2.c @@ -495,8 +495,9 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, union acpi_operand_object **return_object_ptr) { union acpi_operand_object *return_object = *return_object_ptr; - char *dest; + union acpi_operand_object *new_string; char *source; + char *dest; ACPI_FUNCTION_NAME(ns_repair_HID); @@ -517,6 +518,13 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, return_ACPI_STATUS(AE_OK); } + /* It is simplest to always create a new string object */ + + new_string = acpi_ut_create_string_object(return_object->string.length); + if (!new_string) { + return_ACPI_STATUS(AE_NO_MEMORY); + } + /* * Remove a leading asterisk if present. For some unknown reason, there * are many machines in the field that contains IDs like this. @@ -526,7 +534,7 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, source = return_object->string.pointer; if (*source == '*') { source++; - return_object->string.length--; + new_string->string.length--; ACPI_DEBUG_PRINT((ACPI_DB_REPAIR, "%s: Removed invalid leading asterisk\n", @@ -541,11 +549,12 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, * "NNNN####" where N is an uppercase letter or decimal digit, and * # is a hex digit. */ - for (dest = return_object->string.pointer; *source; dest++, source++) { + for (dest = new_string->string.pointer; *source; dest++, source++) { *dest = (char)toupper((int)*source); } - return_object->string.pointer[return_object->string.length] = 0; + acpi_ut_remove_reference(return_object); + *return_object_ptr = new_string; return_ACPI_STATUS(AE_OK); } From af8085f3a4712c57d0dd415ad543bac85780375c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 5 Feb 2021 11:36:30 +1100 Subject: [PATCH 41/69] net: fix iteration for sctp transport seq_files The sctp transport seq_file iterators take a reference to the transport in the ->start and ->next functions and releases the reference in the ->show function. The preferred handling for such resources is to release them in the subsequent ->next or ->stop function call. Since Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") there is no guarantee that ->show will be called after ->next, so this function can now leak references. So move the sctp_transport_put() call to ->next and ->stop. Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") Reported-by: Xin Long Signed-off-by: NeilBrown Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- net/sctp/proc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/sctp/proc.c b/net/sctp/proc.c index f7da88ae20a5..982a87b3e11f 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -215,6 +215,12 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + sctp_transport_walk_stop(&iter->hti); } @@ -222,6 +228,12 @@ static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + ++*pos; return sctp_transport_get_next(seq_file_net(seq), &iter->hti); @@ -277,8 +289,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sk->sk_rcvbuf); seq_printf(seq, "\n"); - sctp_transport_put(transport); - return 0; } @@ -354,8 +364,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - sctp_transport_put(transport); - return 0; } From ce7536bc7398e2ae552d2fabb7e0e371a9f1fe46 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 8 Feb 2021 15:44:54 +0100 Subject: [PATCH 42/69] vsock/virtio: update credit only if socket is not closed If the socket is closed or is being released, some resources used by virtio_transport_space_update() such as 'vsk->trans' may be released. To avoid a use after free bug we should only update the available credit when we are sure the socket is still open and we have the lock held. Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Signed-off-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20210208144454.84438-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 5956939eebb7..e4370b1b7494 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1130,8 +1130,6 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, vsk = vsock_sk(sk); - space_available = virtio_transport_space_update(sk, pkt); - lock_sock(sk); /* Check if sk has been closed before lock_sock */ @@ -1142,6 +1140,8 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, goto free_pkt; } + space_available = virtio_transport_space_update(sk, pkt); + /* Update CID in case it has changed after a transport reset event */ vsk->local_addr.svm_cid = dst.svm_cid; From 07998281c268592963e1cd623fe6ab0270b65ae4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 5 Feb 2021 12:56:43 +0100 Subject: [PATCH 43/69] netfilter: conntrack: skip identical origin tuple in same zone only The origin skip check needs to re-test the zone. Else, we might skip a colliding tuple in the reply direction. This only occurs when using 'directional zones' where origin tuples reside in different zones but the reply tuples share the same zone. This causes the new conntrack entry to be dropped at confirmation time because NAT clash resolution was elided. Fixes: 4e35c1cb9460240 ("netfilter: nf_nat: skip nat clash resolution for same-origin entries") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 234b7cab37c3..ff0168736f6e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1229,7 +1229,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * Let nf_ct_resolve_clash() deal with this later. */ if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) continue; NF_CT_STAT_INC_ATOMIC(net, found); From 664899e85c1312e51d2761e7f8b2f25d053e8489 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 8 Feb 2021 13:20:47 +0100 Subject: [PATCH 44/69] netfilter: nftables: relax check for stateful expressions in set definition Restore the original behaviour where users are allowed to add an element with any stateful expression if the set definition specifies no stateful expressions. Make sure upper maximum number of stateful expressions of NFT_SET_EXPR_MAX is not reached. Fixes: 8cfd9b0f8515 ("netfilter: nftables: generalize set expressions support") Fixes: 48b0ae046ee9 ("netfilter: nftables: netlink support for several set element expressions") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 43fe80f10313..8ee9f40cc0ea 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5281,6 +5281,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); + u32 flags = 0, size = 0, num_exprs = 0; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; @@ -5290,7 +5291,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u32 flags = 0, size = 0; u64 timeout; u64 expiration; int err, i; @@ -5356,7 +5356,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_EXPR]) { struct nft_expr *expr; - if (set->num_exprs != 1) + if (set->num_exprs && set->num_exprs != 1) return -EOPNOTSUPP; expr = nft_set_elem_expr_alloc(ctx, set, @@ -5365,8 +5365,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return PTR_ERR(expr); expr_array[0] = expr; + num_exprs = 1; - if (set->exprs[0] && set->exprs[0]->ops != expr->ops) { + if (set->num_exprs && set->exprs[0]->ops != expr->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } @@ -5375,12 +5376,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nlattr *tmp; int left; - if (set->num_exprs == 0) - return -EOPNOTSUPP; - i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) { - if (i == set->num_exprs) { + if (i == NFT_SET_EXPR_MAX || + (set->num_exprs && set->num_exprs == i)) { err = -E2BIG; goto err_set_elem_expr; } @@ -5394,14 +5393,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_set_elem_expr; } expr_array[i] = expr; + num_exprs++; - if (expr->ops != set->exprs[i]->ops) { + if (set->num_exprs && expr->ops != set->exprs[i]->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } i++; } - if (set->num_exprs != i) { + if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_set_elem_expr; } @@ -5409,6 +5409,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; + + num_exprs = set->num_exprs; } err = nft_setelem_parse_key(ctx, set, &elem.key.val, @@ -5433,8 +5435,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } - if (set->num_exprs) { - for (i = 0; i < set->num_exprs; i++) + if (num_exprs) { + for (i = 0; i < num_exprs; i++) size += expr_array[i]->ops->size; nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS, @@ -5522,7 +5524,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } - for (i = 0; i < set->num_exprs; i++) + for (i = 0; i < num_exprs; i++) nft_set_elem_expr_setup(ext, i, expr_array); trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); @@ -5584,7 +5586,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); err_set_elem_expr: - for (i = 0; i < set->num_exprs && expr_array[i]; i++) + for (i = 0; i < num_exprs && expr_array[i]; i++) nft_expr_destroy(ctx, expr_array[i]); err_set_elem_expr_clone: return err; From 3aa6bce9af0e25b735c9c1263739a5639a336ae8 Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Fri, 5 Feb 2021 17:37:32 -0800 Subject: [PATCH 45/69] net: watchdog: hold device global xmit lock during tx disable Prevent netif_tx_disable() running concurrently with dev_watchdog() by taking the device global xmit lock. Otherwise, the recommended: netif_carrier_off(dev); netif_tx_disable(dev); driver shutdown sequence can happen after the watchdog has already checked carrier, resulting in possible false alarms. This is because netif_tx_lock() only sets the frozen bit without maintaining the locks on the individual queues. Fixes: c3f26a269c24 ("netdev: Fix lockdep warnings in multiqueue configurations.") Signed-off-by: Edwin Peer Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 259be67644e3..5ff27c12ce68 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4352,6 +4352,7 @@ static inline void netif_tx_disable(struct net_device *dev) local_bh_disable(); cpu = smp_processor_id(); + spin_lock(&dev->tx_global_lock); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); @@ -4359,6 +4360,7 @@ static inline void netif_tx_disable(struct net_device *dev) netif_tx_stop_queue(txq); __netif_tx_unlock(txq); } + spin_unlock(&dev->tx_global_lock); local_bh_enable(); } From b2bdba1cbc84cadb14393d0101a5bfd38d342e0a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 6 Feb 2021 22:47:33 +0100 Subject: [PATCH 46/69] bridge: mrp: Fix the usage of br_mrp_port_switchdev_set_state The function br_mrp_port_switchdev_set_state was called both with MRP port state and STP port state, which is an issue because they don't match exactly. Therefore, update the function to be used only with STP port state and use the id SWITCHDEV_ATTR_ID_PORT_STP_STATE. The choice of using STP over MRP is that the drivers already implement SWITCHDEV_ATTR_ID_PORT_STP_STATE and already in SW we update the port STP state. Fixes: 9a9f26e8f7ea30 ("bridge: mrp: Connect MRP API with the switchdev API") Fixes: fadd409136f0f2 ("bridge: switchdev: mrp: Implement MRP API for switchdev") Fixes: 2f1a11ae11d222 ("bridge: mrp: Add MRP interface.") Reported-by: Rasmus Villemoes Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 9 ++++++--- net/bridge/br_mrp_switchdev.c | 7 +++---- net/bridge/br_private_mrp.h | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index cec2c4e4561d..5aeae6ad17b3 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -557,19 +557,22 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state) { + u32 port_state; + if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; spin_lock_bh(&p->br->lock); if (state == BR_MRP_PORT_STATE_FORWARDING) - p->state = BR_STATE_FORWARDING; + port_state = BR_STATE_FORWARDING; else - p->state = BR_STATE_BLOCKING; + port_state = BR_STATE_BLOCKING; + p->state = port_state; spin_unlock_bh(&p->br->lock); - br_mrp_port_switchdev_set_state(p, state); + br_mrp_port_switchdev_set_state(p, port_state); return 0; } diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index ed547e03ace1..75a7e8d0a268 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -169,13 +169,12 @@ int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, return err; } -int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, - enum br_mrp_port_state_type state) +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state) { struct switchdev_attr attr = { .orig_dev = p->dev, - .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, - .u.mrp_port_state = state, + .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, + .u.stp_state = state, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 32a48e5418da..2514954c1431 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -72,8 +72,7 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, u32 interval, u8 max_miss, u32 period, bool monitor); -int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, - enum br_mrp_port_state_type state); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, enum br_mrp_port_role_type role); int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, From 059d2a1004981dce19f0127dabc1b4ec927d202a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 6 Feb 2021 22:47:34 +0100 Subject: [PATCH 47/69] switchdev: mrp: Remove SWITCHDEV_ATTR_ID_MRP_PORT_STAT Now that MRP started to use also SWITCHDEV_ATTR_ID_PORT_STP_STATE to notify HW, then SWITCHDEV_ATTR_ID_MRP_PORT_STAT is not used anywhere else, therefore we can remove it. Fixes: c284b545900830 ("switchdev: mrp: Extend switchdev API to offload MRP") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 99cd538d6519..afdf8bd1b4fe 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -42,7 +42,6 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, #if IS_ENABLED(CONFIG_BRIDGE_MRP) - SWITCHDEV_ATTR_ID_MRP_PORT_STATE, SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, #endif }; @@ -62,7 +61,6 @@ struct switchdev_attr { u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */ bool mc_disabled; /* MC_DISABLED */ #if IS_ENABLED(CONFIG_BRIDGE_MRP) - u8 mrp_port_state; /* MRP_PORT_STATE */ u8 mrp_port_role; /* MRP_PORT_ROLE */ #endif } u; From eb4733d7cffc547e08fe5a216e4f03663bb71108 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 8 Feb 2021 19:36:27 +0200 Subject: [PATCH 48/69] net: dsa: felix: implement port flushing on .phylink_mac_link_down There are several issues which may be seen when the link goes down while forwarding traffic, all of which can be attributed to the fact that the port flushing procedure from the reference manual was not closely followed. With flow control enabled on both the ingress port and the egress port, it may happen when a link goes down that Ethernet packets are in flight. In flow control mode, frames are held back and not dropped. When there is enough traffic in flight (example: iperf3 TCP), then the ingress port might enter congestion and never exit that state. This is a problem, because it is the egress port's link that went down, and that has caused the inability of the ingress port to send packets to any other port. This is solved by flushing the egress port's queues when it goes down. There is also a problem when performing stream splitting for IEEE 802.1CB traffic (not yet upstream, but a sort of multicast, basically). There, if one port from the destination ports mask goes down, splitting the stream towards the other destinations will no longer be performed. This can be traced down to this line: ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); which should have been instead, as per the reference manual: ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, DEV_MAC_ENA_CFG); Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is the case, but apparently multicasting to several ports will cause issues if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set. I am not sure what the state of the Ocelot VSC7514 driver is, but probably not as bad as Felix/Seville, since VSC7514 uses phylib and has the following in ocelot_adjust_link: if (!phydev->link) return; therefore the port is not really put down when the link is lost, unlike the DSA drivers which use .phylink_mac_link_down for that. Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h which are not exported in include/soc/mscc/ and a bugfix patch should probably not move headers around. Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 17 ++++++++- drivers/net/ethernet/mscc/ocelot.c | 54 +++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_io.c | 8 ++++ include/soc/mscc/ocelot.h | 2 + 4 files changed, 80 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 7dc230677b78..45fdb1256dbf 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -233,9 +233,24 @@ static void felix_phylink_mac_link_down(struct dsa_switch *ds, int port, { struct ocelot *ocelot = ds->priv; struct ocelot_port *ocelot_port = ocelot->ports[port]; + int err; + + ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, + DEV_MAC_ENA_CFG); - ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); ocelot_fields_write(ocelot, port, QSYS_SWITCH_PORT_MODE_PORT_ENA, 0); + + err = ocelot_port_flush(ocelot, port); + if (err) + dev_err(ocelot->dev, "failed to flush port %d: %d\n", + port, err); + + /* Put the port in reset. */ + ocelot_port_writel(ocelot_port, + DEV_CLOCK_CFG_MAC_TX_RST | + DEV_CLOCK_CFG_MAC_RX_RST | + DEV_CLOCK_CFG_LINK_SPEED(OCELOT_SPEED_1000), + DEV_CLOCK_CFG); } static void felix_phylink_mac_link_up(struct dsa_switch *ds, int port, diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ff87a0bc089c..c072eb5c0764 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -375,6 +375,60 @@ static void ocelot_vlan_init(struct ocelot *ocelot) } } +static u32 ocelot_read_eq_avail(struct ocelot *ocelot, int port) +{ + return ocelot_read_rix(ocelot, QSYS_SW_STATUS, port); +} + +int ocelot_port_flush(struct ocelot *ocelot, int port) +{ + int err, val; + + /* Disable dequeuing from the egress queues */ + ocelot_rmw_rix(ocelot, QSYS_PORT_MODE_DEQUEUE_DIS, + QSYS_PORT_MODE_DEQUEUE_DIS, + QSYS_PORT_MODE, port); + + /* Disable flow control */ + ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, 0); + + /* Disable priority flow control */ + ocelot_fields_write(ocelot, port, + QSYS_SWITCH_PORT_MODE_TX_PFC_ENA, 0); + + /* Wait at least the time it takes to receive a frame of maximum length + * at the port. + * Worst-case delays for 10 kilobyte jumbo frames are: + * 8 ms on a 10M port + * 800 μs on a 100M port + * 80 μs on a 1G port + * 32 μs on a 2.5G port + */ + usleep_range(8000, 10000); + + /* Disable half duplex backpressure. */ + ocelot_rmw_rix(ocelot, 0, SYS_FRONT_PORT_MODE_HDX_MODE, + SYS_FRONT_PORT_MODE, port); + + /* Flush the queues associated with the port. */ + ocelot_rmw_gix(ocelot, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG_FLUSH_ENA, + REW_PORT_CFG, port); + + /* Enable dequeuing from the egress queues. */ + ocelot_rmw_rix(ocelot, 0, QSYS_PORT_MODE_DEQUEUE_DIS, QSYS_PORT_MODE, + port); + + /* Wait until flushing is complete. */ + err = read_poll_timeout(ocelot_read_eq_avail, val, !val, + 100, 2000000, false, ocelot, port); + + /* Clear flushing again. */ + ocelot_rmw_gix(ocelot, 0, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG, port); + + return err; +} +EXPORT_SYMBOL(ocelot_port_flush); + void ocelot_adjust_link(struct ocelot *ocelot, int port, struct phy_device *phydev) { diff --git a/drivers/net/ethernet/mscc/ocelot_io.c b/drivers/net/ethernet/mscc/ocelot_io.c index 0acb45948418..ea4e83410fe4 100644 --- a/drivers/net/ethernet/mscc/ocelot_io.c +++ b/drivers/net/ethernet/mscc/ocelot_io.c @@ -71,6 +71,14 @@ void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg) } EXPORT_SYMBOL(ocelot_port_writel); +void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg) +{ + u32 cur = ocelot_port_readl(port, reg); + + ocelot_port_writel(port, (cur & (~mask)) | val, reg); +} +EXPORT_SYMBOL(ocelot_port_rmwl); + u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, u32 reg, u32 offset) { diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2f4cd3288bcc..c34b9ccb6472 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -709,6 +709,7 @@ struct ocelot_policer { /* I/O */ u32 ocelot_port_readl(struct ocelot_port *port, u32 reg); void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg); +void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg); u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset); void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset); void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, @@ -737,6 +738,7 @@ int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset); int ocelot_get_ts_info(struct ocelot *ocelot, int port, struct ethtool_ts_info *info); void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); +int ocelot_port_flush(struct ocelot *ocelot, int port); void ocelot_adjust_link(struct ocelot *ocelot, int port, struct phy_device *phydev); int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled, From 67a69f84cab60484f02eb8cbc7a76edffbb28a25 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 9 Feb 2021 17:03:05 +0800 Subject: [PATCH 49/69] net: hns3: add a check for queue_id in hclge_reset_vf_queue() The queue_id is received from vf, if use it directly, an out-of-bound issue may be caused, so add a check for this queue_id before using it in hclge_reset_vf_queue(). Fixes: 1a426f8b40fc ("net: hns3: fix the VF queue reset flow error") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c242883fea5d..48549db23c52 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9813,12 +9813,19 @@ int hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id) void hclge_reset_vf_queue(struct hclge_vport *vport, u16 queue_id) { + struct hnae3_handle *handle = &vport->nic; struct hclge_dev *hdev = vport->back; int reset_try_times = 0; int reset_status; u16 queue_gid; int ret; + if (queue_id >= handle->kinfo.num_tqps) { + dev_warn(&hdev->pdev->dev, "Invalid vf queue id(%u)\n", + queue_id); + return; + } + queue_gid = hclge_covert_handle_qid_global(&vport->nic, queue_id); ret = hclge_send_reset_tqp_cmd(hdev, queue_gid, true); From 326334aad024a60f46dc5e7dbe1efe32da3ca66f Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 9 Feb 2021 17:03:06 +0800 Subject: [PATCH 50/69] net: hns3: add a check for tqp_index in hclge_get_ring_chain_from_mbx() The tqp_index is received from vf, if use it directly, an out-of-bound issue may be caused, so add a check for this tqp_index before using it in hclge_get_ring_chain_from_mbx(). Fixes: 84e095d64ed9 ("net: hns3: Change PF to add ring-vect binding & resetQ to mailbox") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 754c09ada901..ea2dea990283 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -158,21 +158,31 @@ static int hclge_get_ring_chain_from_mbx( struct hclge_vport *vport) { struct hnae3_ring_chain_node *cur_chain, *new_chain; + struct hclge_dev *hdev = vport->back; int ring_num; - int i = 0; + int i; ring_num = req->msg.ring_num; if (ring_num > HCLGE_MBX_MAX_RING_CHAIN_PARAM_NUM) return -ENOMEM; + for (i = 0; i < ring_num; i++) { + if (req->msg.param[i].tqp_index >= vport->nic.kinfo.rss_size) { + dev_err(&hdev->pdev->dev, "tqp index(%u) is out of range(0-%u)\n", + req->msg.param[i].tqp_index, + vport->nic.kinfo.rss_size - 1); + return -EINVAL; + } + } + hnae3_set_bit(ring_chain->flag, HNAE3_RING_TYPE_B, - req->msg.param[i].ring_type); + req->msg.param[0].ring_type); ring_chain->tqp_index = hclge_get_queue_id(vport->nic.kinfo.tqp - [req->msg.param[i].tqp_index]); + [req->msg.param[0].tqp_index]); hnae3_set_field(ring_chain->int_gl_idx, HNAE3_RING_GL_IDX_M, - HNAE3_RING_GL_IDX_S, req->msg.param[i].int_gl_index); + HNAE3_RING_GL_IDX_S, req->msg.param[0].int_gl_index); cur_chain = ring_chain; From 532cfc0df1e4d68e74522ef4a0dcbf6ebbe68287 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 9 Feb 2021 17:03:07 +0800 Subject: [PATCH 51/69] net: hns3: add a check for index in hclge_get_rss_key() The index is received from vf, if use it directly, an out-of-bound issue may be caused, so add a check for this index before using it in hclge_get_rss_key(). Fixes: a638b1d8cc87 ("net: hns3: fix get VF RSS issue") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index ea2dea990283..ffb416e088a9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -607,6 +607,17 @@ static void hclge_get_rss_key(struct hclge_vport *vport, index = mbx_req->msg.data[0]; + /* Check the query index of rss_hash_key from VF, make sure no + * more than the size of rss_hash_key. + */ + if (((index + 1) * HCLGE_RSS_MBX_RESP_LEN) > + sizeof(vport[0].rss_hash_key)) { + dev_warn(&hdev->pdev->dev, + "failed to get the rss hash key, the index(%u) invalid !\n", + index); + return; + } + memcpy(resp_msg->data, &hdev->vport[0].rss_hash_key[index * HCLGE_RSS_MBX_RESP_LEN], HCLGE_RSS_MBX_RESP_LEN); From 1c5fae9c9a092574398a17facc31c533791ef232 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 9 Feb 2021 09:52:19 +0100 Subject: [PATCH 52/69] vsock: fix locking in vsock_shutdown() In vsock_shutdown() we touched some socket fields without holding the socket lock, such as 'state' and 'sk_flags'. Also, after the introduction of multi-transport, we are accessing 'vsk->transport' in vsock_send_shutdown() without holding the lock and this call can be made while the connection is in progress, so the transport can change in the meantime. To avoid issues, we hold the socket lock when we enter in vsock_shutdown() and release it when we leave. Among the transports that implement the 'shutdown' callback, only hyperv_transport acquired the lock. Since the caller now holds it, we no longer take it. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 8 +++++--- net/vmw_vsock/hyperv_transport.c | 4 ---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4ea301fc2bf0..5546710d8ac1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -943,10 +943,12 @@ static int vsock_shutdown(struct socket *sock, int mode) */ sk = sock->sk; + + lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sk->sk_type == SOCK_STREAM) - return err; + goto out; } else { sock->state = SS_DISCONNECTING; err = 0; @@ -955,10 +957,8 @@ static int vsock_shutdown(struct socket *sock, int mode) /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { - lock_sock(sk); sk->sk_shutdown |= mode; sk->sk_state_change(sk); - release_sock(sk); if (sk->sk_type == SOCK_STREAM) { sock_reset_flag(sk, SOCK_DONE); @@ -966,6 +966,8 @@ static int vsock_shutdown(struct socket *sock, int mode) } } +out: + release_sock(sk); return err; } diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 630b851f8150..cc3bae2659e7 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -474,14 +474,10 @@ static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode) static int hvs_shutdown(struct vsock_sock *vsk, int mode) { - struct sock *sk = sk_vsock(vsk); - if (!(mode & SEND_SHUTDOWN)) return 0; - lock_sock(sk); hvs_shutdown_lock_held(vsk->trans, mode); - release_sock(sk); return 0; } From ee114dd64c0071500345439fc79dd5e0f9d106ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Feb 2021 17:20:14 +0100 Subject: [PATCH 53/69] bpf: Fix verifier jsgt branch analysis on max bound Fix incorrect is_branch{32,64}_taken() analysis for the jsgt case. The return code for both will tell the caller whether a given conditional jump is taken or not, e.g. 1 means branch will be taken [for the involved registers] and the goto target will be executed, 0 means branch will not be taken and instead we fall-through to the next insn, and last but not least a -1 denotes that it is not known at verification time whether a branch will be taken or not. Now while the jsgt has the branch-taken case correct with reg->s32_min_value > sval, the branch-not-taken case is off-by-one when testing for reg->s32_max_value < sval since the branch will also be taken for reg->s32_max_value == sval. The jgt branch analysis, for example, gets this right. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Fixes: 4f7b3e82589e ("bpf: improve verifier branch analysis") Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7368c5eacb7..67ea0ac3333c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6877,7 +6877,7 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JSGT: if (reg->s32_min_value > sval) return 1; - else if (reg->s32_max_value < sval) + else if (reg->s32_max_value <= sval) return 0; break; case BPF_JLT: @@ -6950,7 +6950,7 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JSGT: if (reg->smin_value > sval) return 1; - else if (reg->smax_value < sval) + else if (reg->smax_value <= sval) return 0; break; case BPF_JLT: From fd675184fc7abfd1e1c52d23e8e900676b5a1c1a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Feb 2021 20:48:21 +0100 Subject: [PATCH 54/69] bpf: Fix verifier jmp32 pruning decision logic Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes: func#0 @0 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 808464450 1: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (b4) w4 = 808464432 2: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP808464432 R10=fp0 2: (9c) w4 %= w0 3: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0 3: (66) if w4 s> 0x30303030 goto pc+0 R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 4: (7f) r0 >>= r0 5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 5: (9c) w4 %= w0 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 9: (95) exit propagating r0 from 6 to 7: safe 4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0 4: (7f) r0 >>= r0 5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0 5: (9c) w4 %= w0 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 propagating r0 7: safe propagating r0 from 6 to 7: safe processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1 The underlying program was xlated as follows: # bpftool p d x i 10 0: (b7) r0 = 808464450 1: (b4) w4 = 808464432 2: (bc) w0 = w0 3: (15) if r0 == 0x0 goto pc+1 4: (9c) w4 %= w0 5: (66) if w4 s> 0x30303030 goto pc+0 6: (7f) r0 >>= r0 7: (bc) w0 = w0 8: (15) if r0 == 0x0 goto pc+1 9: (9c) w4 %= w0 10: (66) if w0 s> 0x3030 goto pc+0 11: (d6) if w0 s<= 0x303030 goto pc+1 12: (05) goto pc-1 13: (95) exit The verifier rewrote original instructions it recognized as dead code with 'goto pc-1', but reality differs from verifier simulation in that we are actually able to trigger a hang due to hitting the 'goto pc-1' instructions. Taking a closer look at the verifier analysis, the reason is that it misjudges its pruning decision at the first 'from 6 to 7: safe' occasion. What happens is that while both old/cur registers are marked as precise, they get misjudged for the jmp32 case as range_within() yields true, meaning that the prior verification path with a wider register bound could be verified successfully and therefore the current path with a narrower register bound is deemed safe as well whereas in reality it's not. R0 old/cur path's bounds compare as follows: old: smin_value=0x8000000000000000,smax_value=0x7fffffffffffffff,umin_value=0x0,umax_value=0xffffffffffffffff,var_off=(0x0; 0xffffffffffffffff) cur: smin_value=0x8000000000000000,smax_value=0x7fffffff7fffffff,umin_value=0x0,umax_value=0xffffffff7fffffff,var_off=(0x0; 0xffffffff7fffffff) old: s32_min_value=0x80000000,s32_max_value=0x00003030,u32_min_value=0x00000000,u32_max_value=0xffffffff cur: s32_min_value=0x00003031,s32_max_value=0x7fffffff,u32_min_value=0x00003031,u32_max_value=0x7fffffff The 64 bit bounds generally look okay and while the information that got propagated from 32 to 64 bit looks correct as well, it's not precise enough for judging a conditional jmp32. Given the latter only operates on subregisters we also need to take these into account as well for a range_within() probe in order to be able to prune paths. Extending the range_within() constraint to both bounds will be able to tell us that the old signed 32 bit bounds are not wider than the cur signed 32 bit bounds. With the fix in place, the program will now verify the 'goto' branch case as it should have been: [...] 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 9: (95) exit 7: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=12337,u32_min_value=12337,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 8: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 8: (30) r0 = *(u8 *)skb[808464432] BPF_LD_[ABS|IND] uses reserved fields processed 11 insns (limit 1000000) max_states_per_insn 1 total_states 1 peak_states 1 mark_read 1 The bug is quite subtle in the sense that when verifier would determine that a given branch is dead code, it would (here: wrongly) remove these instructions from the program and hard-wire the taken branch for privileged programs instead of the 'goto pc-1' rewrites which will cause hard to debug problems. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67ea0ac3333c..24c0e9224f3c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8590,7 +8590,11 @@ static bool range_within(struct bpf_reg_state *old, return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value; + old->smax_value >= cur->smax_value && + old->u32_min_value <= cur->u32_min_value && + old->u32_max_value >= cur->u32_max_value && + old->s32_min_value <= cur->s32_min_value && + old->s32_max_value >= cur->s32_max_value; } /* Maximum number of register states that can exist at once */ From e88b2c6e5a4d9ce30d75391e4d950da74bb2bd90 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Feb 2021 18:46:10 +0000 Subject: [PATCH 55/69] bpf: Fix 32 bit src register truncation on div/mod While reviewing a different fix, John and I noticed an oddity in one of the BPF program dumps that stood out, for example: # bpftool p d x i 13 0: (b7) r0 = 808464450 1: (b4) w4 = 808464432 2: (bc) w0 = w0 3: (15) if r0 == 0x0 goto pc+1 4: (9c) w4 %= w0 [...] In line 2 we noticed that the mov32 would 32 bit truncate the original src register for the div/mod operation. While for the two operations the dst register is typically marked unknown e.g. from adjust_scalar_min_max_vals() the src register is not, and thus verifier keeps tracking original bounds, simplified: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = -1 1: R0_w=invP-1 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (b7) r1 = -1 2: R0_w=invP-1 R1_w=invP-1 R10=fp0 2: (3c) w0 /= w1 3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP-1 R10=fp0 3: (77) r1 >>= 32 4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP4294967295 R10=fp0 4: (bf) r0 = r1 5: R0_w=invP4294967295 R1_w=invP4294967295 R10=fp0 5: (95) exit processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 Runtime result of r0 at exit is 0 instead of expected -1. Remove the verifier mov32 src rewrite in div/mod and replace it with a jmp32 test instead. After the fix, we result in the following code generation when having dividend r1 and divisor r6: div, 64 bit: div, 32 bit: 0: (b7) r6 = 8 0: (b7) r6 = 8 1: (b7) r1 = 8 1: (b7) r1 = 8 2: (55) if r6 != 0x0 goto pc+2 2: (56) if w6 != 0x0 goto pc+2 3: (ac) w1 ^= w1 3: (ac) w1 ^= w1 4: (05) goto pc+1 4: (05) goto pc+1 5: (3f) r1 /= r6 5: (3c) w1 /= w6 6: (b7) r0 = 0 6: (b7) r0 = 0 7: (95) exit 7: (95) exit mod, 64 bit: mod, 32 bit: 0: (b7) r6 = 8 0: (b7) r6 = 8 1: (b7) r1 = 8 1: (b7) r1 = 8 2: (15) if r6 == 0x0 goto pc+1 2: (16) if w6 == 0x0 goto pc+1 3: (9f) r1 %= r6 3: (9c) w1 %= w6 4: (b7) r0 = 0 4: (b7) r0 = 0 5: (95) exit 5: (95) exit x86 in particular can throw a 'divide error' exception for div instruction not only for divisor being zero, but also for the case when the quotient is too large for the designated register. For the edx:eax and rdx:rax dividend pair it is not an issue in x86 BPF JIT since we always zero edx (rdx). Hence really the only protection needed is against divisor being zero. Fixes: 68fda450a7df ("bpf: fix 32-bit divide by zero") Co-developed-by: John Fastabend Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 24c0e9224f3c..37581919e050 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11003,30 +11003,28 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - struct bpf_insn mask_and_div[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + struct bpf_insn *patchlet; + struct bpf_insn chk_and_div[] = { /* Rx div 0 -> 0 */ - BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0), BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), BPF_JMP_IMM(BPF_JA, 0, 0, 1), *insn, }; - struct bpf_insn mask_and_mod[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), + struct bpf_insn chk_and_mod[] = { /* Rx mod 0 -> Rx */ - BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1, 0), *insn, }; - struct bpf_insn *patchlet; - if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - patchlet = mask_and_div + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); - } else { - patchlet = mask_and_mod + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); - } + patchlet = isdiv ? chk_and_div : chk_and_mod; + cnt = isdiv ? ARRAY_SIZE(chk_and_div) : + ARRAY_SIZE(chk_and_mod); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) From e812cbbbbbb15adbbbee176baa1e8bda53059bf0 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 9 Feb 2021 13:41:50 -0800 Subject: [PATCH 56/69] squashfs: avoid out of bounds writes in decompressors Patch series "Squashfs: fix BIO migration regression and add sanity checks". Patch [1/4] fixes a regression introduced by the "migrate from ll_rw_block usage to BIO" patch, which has produced a number of Sysbot/Syzkaller reports. Patches [2/4], [3/4], and [4/4] fix a number of filesystem corruption issues which have produced Sysbot reports in the id, inode and xattr lookup code. Each patch has been tested against the Sysbot reproducers using the given kernel configuration. They have the appropriate "Reported-by:" lines added. Additionally, all of the reproducer filesystems are indirectly fixed by patch [4/4] due to the fact they all have xattr corruption which is now detected there. Additional testing with other configurations and architectures (32bit, big endian), and normal filesystems has also been done to trap any inadvertent regressions caused by the additional sanity checks. This patch (of 4): This is a regression introduced by the patch "migrate from ll_rw_block usage to BIO". Sysbot/Syskaller has reported a number of "out of bounds writes" and "unable to handle kernel paging request in squashfs_decompress" errors which have been identified as a regression introduced by the above patch. Specifically, the patch removed the following sanity check if (length < 0 || length > output->length || (index + length) > msblk->bytes_used) This check did two things: 1. It ensured any reads were not beyond the end of the filesystem 2. It ensured that the "length" field read from the filesystem was within the expected maximum length. Without this any corrupted values can over-run allocated buffers. Link: https://lkml.kernel.org/r/20210204130249.4495-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20210204130249.4495-2-phillip@squashfs.org.uk Fixes: 93e72b3c612adc ("squashfs: migrate from ll_rw_block usage to BIO") Reported-by: syzbot+6fba78f99b9afd4b5634@syzkaller.appspotmail.com Signed-off-by: Phillip Lougher Cc: Philippe Liard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/block.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 8a19773b5a0b..45f44425d856 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -196,9 +196,15 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, length = SQUASHFS_COMPRESSED_SIZE(length); index += 2; - TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2, compressed ? "" : "un", length); } + if (length < 0 || length > output->length || + (index + length) > msblk->bytes_used) { + res = -EIO; + goto out; + } + if (next_index) *next_index = index + length; From f37aa4c7366e23f91b81d00bafd6a7ab54e4a381 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 9 Feb 2021 13:41:53 -0800 Subject: [PATCH 57/69] squashfs: add more sanity checks in id lookup Sysbot has reported a number of "slab-out-of-bounds reads" and "use-after-free read" errors which has been identified as being caused by a corrupted index value read from the inode. This could be because the metadata block is uncompressed, or because the "compression" bit has been corrupted (turning a compressed block into an uncompressed block). This patch adds additional sanity checks to detect this, and the following corruption. 1. It checks against corruption of the ids count. This can either lead to a larger table to be read, or a smaller than expected table to be read. In the case of a too large ids count, this would often have been trapped by the existing sanity checks, but this patch introduces a more exact check, which can identify too small values. 2. It checks the contents of the index table for corruption. Link: https://lkml.kernel.org/r/20210204130249.4495-3-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: syzbot+b06d57ba83f604522af2@syzkaller.appspotmail.com Reported-by: syzbot+c021ba012da41ee9807c@syzkaller.appspotmail.com Reported-by: syzbot+5024636e8b5fd19f0f19@syzkaller.appspotmail.com Reported-by: syzbot+bcbc661df46657d0fa4f@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/id.c | 40 ++++++++++++++++++++++++++++-------- fs/squashfs/squashfs_fs_sb.h | 1 + fs/squashfs/super.c | 6 +++--- fs/squashfs/xattr.h | 10 ++++++++- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index 6be5afe7287d..11581bf31af4 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -35,10 +35,15 @@ int squashfs_get_id(struct super_block *sb, unsigned int index, struct squashfs_sb_info *msblk = sb->s_fs_info; int block = SQUASHFS_ID_BLOCK(index); int offset = SQUASHFS_ID_BLOCK_OFFSET(index); - u64 start_block = le64_to_cpu(msblk->id_table[block]); + u64 start_block; __le32 disk_id; int err; + if (index >= msblk->ids) + return -EINVAL; + + start_block = le64_to_cpu(msblk->id_table[block]); + err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, sizeof(disk_id)); if (err < 0) @@ -56,7 +61,10 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb, u64 id_table_start, u64 next_table, unsigned short no_ids) { unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); + unsigned int indexes = SQUASHFS_ID_BLOCKS(no_ids); + int n; __le64 *table; + u64 start, end; TRACE("In read_id_index_table, length %d\n", length); @@ -67,20 +75,36 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb, return ERR_PTR(-EINVAL); /* - * length bytes should not extend into the next table - this check - * also traps instances where id_table_start is incorrectly larger - * than the next table start + * The computed size of the index table (length bytes) should exactly + * match the table start and end points */ - if (id_table_start + length > next_table) + if (length != (next_table - id_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, id_table_start, length); + if (IS_ERR(table)) + return table; /* - * table[0] points to the first id lookup table metadata block, this - * should be less than id_table_start + * table[0], table[1], ... table[indexes - 1] store the locations + * of the compressed id blocks. Each entry should be less than + * the next (i.e. table[0] < table[1]), and the difference between them + * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1] + * should be less than id_table_start, and again the difference + * should be SQUASHFS_METADATA_SIZE or less */ - if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) { + for (n = 0; n < (indexes - 1); n++) { + start = le64_to_cpu(table[n]); + end = le64_to_cpu(table[n + 1]); + + if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } + } + + start = le64_to_cpu(table[indexes - 1]); + if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 34c21ffb6df3..166e98806265 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -64,5 +64,6 @@ struct squashfs_sb_info { unsigned int inodes; unsigned int fragments; int xattr_ids; + unsigned int ids; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index d6c6593ec169..88cc94be1076 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -166,6 +166,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->directory_table = le64_to_cpu(sblk->directory_table_start); msblk->inodes = le32_to_cpu(sblk->inodes); msblk->fragments = le32_to_cpu(sblk->fragments); + msblk->ids = le16_to_cpu(sblk->no_ids); flags = le16_to_cpu(sblk->flags); TRACE("Found valid superblock on %pg\n", sb->s_bdev); @@ -177,7 +178,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Block size %d\n", msblk->block_size); TRACE("Number of inodes %d\n", msblk->inodes); TRACE("Number of fragments %d\n", msblk->fragments); - TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); + TRACE("Number of ids %d\n", msblk->ids); TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); TRACE("sblk->fragment_table_start %llx\n", @@ -236,8 +237,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) allocate_id_index_table: /* Allocate and read id index table */ msblk->id_table = squashfs_read_id_index_table(sb, - le64_to_cpu(sblk->id_table_start), next_table, - le16_to_cpu(sblk->no_ids)); + le64_to_cpu(sblk->id_table_start), next_table, msblk->ids); if (IS_ERR(msblk->id_table)) { errorf(fc, "unable to read id index table"); err = PTR_ERR(msblk->id_table); diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index 184129afd456..d8a270d3ac4c 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -17,8 +17,16 @@ extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, u64 *xattr_table_start, int *xattr_ids) { + struct squashfs_xattr_id_table *id_table; + + id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + if (IS_ERR(id_table)) + return (__le64 *) id_table; + + *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); + kfree(id_table); + ERROR("Xattrs in filesystem, these will be ignored\n"); - *xattr_table_start = start; return ERR_PTR(-ENOTSUPP); } From eabac19e40c095543def79cb6ffeb3a8588aaff4 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 9 Feb 2021 13:41:56 -0800 Subject: [PATCH 58/69] squashfs: add more sanity checks in inode lookup Sysbot has reported an "slab-out-of-bounds read" error which has been identified as being caused by a corrupted "ino_num" value read from the inode. This could be because the metadata block is uncompressed, or because the "compression" bit has been corrupted (turning a compressed block into an uncompressed block). This patch adds additional sanity checks to detect this, and the following corruption. 1. It checks against corruption of the inodes count. This can either lead to a larger table to be read, or a smaller than expected table to be read. In the case of a too large inodes count, this would often have been trapped by the existing sanity checks, but this patch introduces a more exact check, which can identify too small values. 2. It checks the contents of the index table for corruption. [phillip@squashfs.org.uk: fix checkpatch issue] Link: https://lkml.kernel.org/r/527909353.754618.1612769948607@webmail.123-reg.co.uk Link: https://lkml.kernel.org/r/20210204130249.4495-4-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: syzbot+04419e3ff19d2970ea28@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/export.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index ae2c87bb0fbe..eb02072d28dd 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -41,12 +41,17 @@ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) struct squashfs_sb_info *msblk = sb->s_fs_info; int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); - u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); + u64 start; __le64 ino; int err; TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); + if (ino_num == 0 || (ino_num - 1) >= msblk->inodes) + return -EINVAL; + + start = le64_to_cpu(msblk->inode_lookup_table[blk]); + err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); if (err < 0) return err; @@ -111,7 +116,10 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, u64 lookup_table_start, u64 next_table, unsigned int inodes) { unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); + unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes); + int n; __le64 *table; + u64 start, end; TRACE("In read_inode_lookup_table, length %d\n", length); @@ -121,20 +129,37 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, if (inodes == 0) return ERR_PTR(-EINVAL); - /* length bytes should not extend into the next table - this check - * also traps instances where lookup_table_start is incorrectly larger - * than the next table start + /* + * The computed size of the lookup table (length bytes) should exactly + * match the table start and end points */ - if (lookup_table_start + length > next_table) + if (length != (next_table - lookup_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, lookup_table_start, length); + if (IS_ERR(table)) + return table; /* - * table[0] points to the first inode lookup table metadata block, - * this should be less than lookup_table_start + * table0], table[1], ... table[indexes - 1] store the locations + * of the compressed inode lookup blocks. Each entry should be + * less than the next (i.e. table[0] < table[1]), and the difference + * between them should be SQUASHFS_METADATA_SIZE or less. + * table[indexes - 1] should be less than lookup_table_start, and + * again the difference should be SQUASHFS_METADATA_SIZE or less */ - if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) { + for (n = 0; n < (indexes - 1); n++) { + start = le64_to_cpu(table[n]); + end = le64_to_cpu(table[n + 1]); + + if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } + } + + start = le64_to_cpu(table[indexes - 1]); + if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) { kfree(table); return ERR_PTR(-EINVAL); } From 506220d2ba21791314af569211ffd8870b8208fa Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 9 Feb 2021 13:42:00 -0800 Subject: [PATCH 59/69] squashfs: add more sanity checks in xattr id lookup Sysbot has reported a warning where a kmalloc() attempt exceeds the maximum limit. This has been identified as corruption of the xattr_ids count when reading the xattr id lookup table. This patch adds a number of additional sanity checks to detect this corruption and others. 1. It checks for a corrupted xattr index read from the inode. This could be because the metadata block is uncompressed, or because the "compression" bit has been corrupted (turning a compressed block into an uncompressed block). This would cause an out of bounds read. 2. It checks against corruption of the xattr_ids count. This can either lead to the above kmalloc failure, or a smaller than expected table to be read. 3. It checks the contents of the index table for corruption. [phillip@squashfs.org.uk: fix checkpatch issue] Link: https://lkml.kernel.org/r/270245655.754655.1612770082682@webmail.123-reg.co.uk Link: https://lkml.kernel.org/r/20210204130249.4495-5-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: syzbot+2ccea6339d368360800d@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/xattr_id.c | 66 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index d99e08464554..ead66670b41a 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -31,10 +31,15 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, struct squashfs_sb_info *msblk = sb->s_fs_info; int block = SQUASHFS_XATTR_BLOCK(index); int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); - u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]); + u64 start_block; struct squashfs_xattr_id id; int err; + if (index >= msblk->xattr_ids) + return -EINVAL; + + start_block = le64_to_cpu(msblk->xattr_id_table[block]); + err = squashfs_read_metadata(sb, &id, &start_block, &offset, sizeof(id)); if (err < 0) @@ -50,13 +55,17 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, /* * Read uncompressed xattr id lookup table indexes from disk into memory */ -__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, +__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, u64 *xattr_table_start, int *xattr_ids) { - unsigned int len; + struct squashfs_sb_info *msblk = sb->s_fs_info; + unsigned int len, indexes; struct squashfs_xattr_id_table *id_table; + __le64 *table; + u64 start, end; + int n; - id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + id_table = squashfs_read_table(sb, table_start, sizeof(*id_table)); if (IS_ERR(id_table)) return (__le64 *) id_table; @@ -70,13 +79,52 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, if (*xattr_ids == 0) return ERR_PTR(-EINVAL); - /* xattr_table should be less than start */ - if (*xattr_table_start >= start) + len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids); + + /* + * The computed size of the index table (len bytes) should exactly + * match the table start and end points + */ + start = table_start + sizeof(*id_table); + end = msblk->bytes_used; + + if (len != (end - start)) return ERR_PTR(-EINVAL); - len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + table = squashfs_read_table(sb, start, len); + if (IS_ERR(table)) + return table; - TRACE("In read_xattr_index_table, length %d\n", len); + /* table[0], table[1], ... table[indexes - 1] store the locations + * of the compressed xattr id blocks. Each entry should be less than + * the next (i.e. table[0] < table[1]), and the difference between them + * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1] + * should be less than table_start, and again the difference + * shouls be SQUASHFS_METADATA_SIZE or less. + * + * Finally xattr_table_start should be less than table[0]. + */ + for (n = 0; n < (indexes - 1); n++) { + start = le64_to_cpu(table[n]); + end = le64_to_cpu(table[n + 1]); - return squashfs_read_table(sb, start + sizeof(*id_table), len); + if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } + } + + start = le64_to_cpu(table[indexes - 1]); + if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + if (*xattr_table_start >= le64_to_cpu(table[0])) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; } From 1cc4cdb521f9689183474bc89eefc451ac44fa1c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 9 Feb 2021 13:42:03 -0800 Subject: [PATCH 60/69] kasan: fix stack traces dependency for HW_TAGS Currently, whether the alloc/free stack traces collection is enabled by default for hardware tag-based KASAN depends on CONFIG_DEBUG_KERNEL. The intention for this dependency was to only enable collection on slow debug kernels due to a significant perf and memory impact. As it turns out, CONFIG_DEBUG_KERNEL is not considered a debug option and is enabled on many productions kernels including Android and Ubuntu. As the result, this dependency is pointless and only complicates the code and documentation. Having stack traces collection disabled by default would make the hardware mode work differently to to the software ones, which is confusing. This change removes the dependency and enables stack traces collection by default. Looking into the future, this default might makes sense for production kernels, assuming we implement a fast stack trace collection approach. Link: https://lkml.kernel.org/r/6678d77ceffb71f1cff2cf61560e2ffe7bb6bfe9.1612808820.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Will Deacon Cc: Andrey Ryabinin Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 3 +-- mm/kasan/hw_tags.c | 8 ++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 1651d961f06a..a248ac3941be 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -163,8 +163,7 @@ particular KASAN features. - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack - traces collection (default: ``on`` for ``CONFIG_DEBUG_KERNEL=y``, otherwise - ``off``). + traces collection (default: ``on``). - ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN report or also panic the kernel (default: ``report``). diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index e529428e7a11..d558799b25b3 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -134,12 +134,8 @@ void __init kasan_init_hw_tags(void) switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: - /* - * Default to enabling stack trace collection for - * debug kernels. - */ - if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) - static_branch_enable(&kasan_flag_stacktrace); + /* Default to enabling stack trace collection. */ + static_branch_enable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_OFF: /* Do nothing, kasan_flag_stacktrace keeps its default value. */ From 793f49a87aae24e5bcf92ad98d764153fc936570 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 9 Feb 2021 13:42:07 -0800 Subject: [PATCH 61/69] firmware_loader: align .builtin_fw to 8 arm64 references the start address of .builtin_fw (__start_builtin_fw) with a pair of R_AARCH64_ADR_PREL_PG_HI21/R_AARCH64_LDST64_ABS_LO12_NC relocations. The compiler is allowed to emit the R_AARCH64_LDST64_ABS_LO12_NC relocation because struct builtin_fw in include/linux/firmware.h is 8-byte aligned. The R_AARCH64_LDST64_ABS_LO12_NC relocation requires the address to be a multiple of 8, which may not be the case if .builtin_fw is empty. Unconditionally align .builtin_fw to fix the linker error. 32-bit architectures could use ALIGN(4) but that would add unnecessary complexity, so just use ALIGN(8). Link: https://lkml.kernel.org/r/20201208054646.2913063-1-maskray@google.com Link: https://github.com/ClangBuiltLinux/linux/issues/1204 Fixes: 5658c76 ("firmware: allow firmware files to be built into kernel image") Signed-off-by: Fangrui Song Reported-by: kernel test robot Acked-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Tested-by: Douglas Anderson Acked-by: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b2b3d81b1535..b97c628ad91f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -459,7 +459,7 @@ } \ \ /* Built-in firmware blobs */ \ - .builtin_fw : AT(ADDR(.builtin_fw) - LOAD_OFFSET) { \ + .builtin_fw : AT(ADDR(.builtin_fw) - LOAD_OFFSET) ALIGN(8) { \ __start_builtin_fw = .; \ KEEP(*(.builtin_fw)) \ __end_builtin_fw = .; \ From a30a29091b5a6d4c64b5fc77040720a65e2dd4e6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Feb 2021 13:42:10 -0800 Subject: [PATCH 62/69] mm/mremap: fix BUILD_BUG_ON() error in get_extent clang can't evaluate this function argument at compile time when the function is not inlined, which leads to a link time failure: ld.lld: error: undefined symbol: __compiletime_assert_414 >>> referenced by mremap.c >>> mremap.o:(get_extent) in archive mm/built-in.a Mark the function as __always_inline to avoid it. Link: https://lkml.kernel.org/r/20201230154104.522605-1-arnd@kernel.org Fixes: 9ad9718bfa41 ("mm/mremap: calculate extent in one place") Signed-off-by: Arnd Bergmann Tested-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Tested-by: Sedat Dilek Cc: Kirill A. Shutemov" Cc: Wei Yang Cc: Vlastimil Babka Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Brian Geffon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index f554320281cc..aa63bfd3cad2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -336,8 +336,9 @@ enum pgt_entry { * valid. Else returns a smaller extent bounded by the end of the source and * destination pgt_entry. */ -static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr, - unsigned long old_end, unsigned long new_addr) +static __always_inline unsigned long get_extent(enum pgt_entry entry, + unsigned long old_addr, unsigned long old_end, + unsigned long new_addr) { unsigned long next, extent, mask, size; From b85a7a8bb5736998b8a681937a9749b350c17988 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 9 Feb 2021 13:42:14 -0800 Subject: [PATCH 63/69] tmpfs: disallow CONFIG_TMPFS_INODE64 on s390 Currently there is an assumption in tmpfs that 64-bit architectures also have a 64-bit ino_t. This is not true on s390 which has a 32-bit ino_t. With CONFIG_TMPFS_INODE64=y tmpfs mounts will get 64-bit inode numbers and display "inode64" in the mount options, but passing the "inode64" mount option will fail. This leads to the following behavior: # mkdir mnt # mount -t tmpfs nodev mnt # mount -o remount,rw mnt mount: /home/ubuntu/mnt: mount point not mounted or bad option. As mount sees "inode64" in the mount options and thus passes it in the options for the remount. So prevent CONFIG_TMPFS_INODE64 from being selected on s390. Link: https://lkml.kernel.org/r/20210205230620.518245-1-seth.forshee@canonical.com Fixes: ea3271f7196c ("tmpfs: support 64-bit inums per-sb") Signed-off-by: Seth Forshee Acked-by: Hugh Dickins Cc: Chris Down Cc: Hugh Dickins Cc: Amir Goldstein Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: [5.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/Kconfig b/fs/Kconfig index aa4c12282301..3347ec7bd837 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -203,7 +203,7 @@ config TMPFS_XATTR config TMPFS_INODE64 bool "Use 64-bit ino_t by default in tmpfs" - depends on TMPFS && 64BIT + depends on TMPFS && 64BIT && !S390 default n help tmpfs has historically used only inode numbers as wide as an unsigned From ad69c389ec110ea54f8b0c0884b255340ef1c736 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 9 Feb 2021 13:42:17 -0800 Subject: [PATCH 64/69] tmpfs: disallow CONFIG_TMPFS_INODE64 on alpha As with s390, alpha is a 64-bit architecture with a 32-bit ino_t. With CONFIG_TMPFS_INODE64=y tmpfs mounts will get 64-bit inode numbers and display "inode64" in the mount options, whereas passing "inode64" in the mount options will fail. This leads to erroneous behaviours such as this: # mkdir mnt # mount -t tmpfs nodev mnt # mount -o remount,rw mnt mount: /home/ubuntu/mnt: mount point not mounted or bad option. Prevent CONFIG_TMPFS_INODE64 from being selected on alpha. Link: https://lkml.kernel.org/r/20210208215726.608197-1-seth.forshee@canonical.com Fixes: ea3271f7196c ("tmpfs: support 64-bit inums per-sb") Signed-off-by: Seth Forshee Acked-by: Hugh Dickins Cc: Chris Down Cc: Amir Goldstein Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: [5.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/Kconfig b/fs/Kconfig index 3347ec7bd837..da524c4d7b7e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -203,7 +203,7 @@ config TMPFS_XATTR config TMPFS_INODE64 bool "Use 64-bit ino_t by default in tmpfs" - depends on TMPFS && 64BIT && !S390 + depends on TMPFS && 64BIT && !(S390 || ALPHA) default n help tmpfs has historically used only inode numbers as wide as an unsigned From d52db800846f66d98a4e14c39cf88a06bcd9985f Mon Sep 17 00:00:00 2001 From: Rong Chen Date: Tue, 9 Feb 2021 13:42:21 -0800 Subject: [PATCH 65/69] selftests/vm: rename file run_vmtests to run_vmtests.sh Commit c2aa8afc36fa has renamed run_vmtests in Makefile, but the file still uses the old name. The kernel test robot reported the following issue: # selftests: vm: run_vmtests.sh # Warning: file run_vmtests.sh is missing! not ok 1 selftests: vm: run_vmtests.sh Link: https://lkml.kernel.org/r/20210205085507.1479894-1-rong.a.chen@intel.com Fixes: c2aa8afc36fa (selftests/vm: rename run_vmtests --> run_vmtests.sh) Signed-off-by: Rong Chen Reported-by: kernel test robot Reviewed-by: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/{run_vmtests => run_vmtests.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tools/testing/selftests/vm/{run_vmtests => run_vmtests.sh} (100%) diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests.sh similarity index 100% rename from tools/testing/selftests/vm/run_vmtests rename to tools/testing/selftests/vm/run_vmtests.sh From a0c2eb0a4387322ebc629c01f5adb2d957c343fe Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 9 Feb 2021 13:42:24 -0800 Subject: [PATCH 66/69] MAINTAINERS: update Andrey Ryabinin's email address Update my email, @virtuozzo.com will stop working shortly. Link: https://lkml.kernel.org/r/20210204223904.3824-1-ryabinin.a.a@gmail.com Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index d674968df008..7fdf87b24fe8 100644 --- a/.mailmap +++ b/.mailmap @@ -37,6 +37,7 @@ Andrew Murray Andrew Murray Andrew Vasquez Andrey Ryabinin +Andrey Ryabinin Andy Adamson Antoine Tenart Antoine Tenart diff --git a/MAINTAINERS b/MAINTAINERS index 667d03852191..64c7169db617 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9559,7 +9559,7 @@ F: Documentation/hwmon/k8temp.rst F: drivers/hwmon/k8temp.c KASAN -M: Andrey Ryabinin +M: Andrey Ryabinin R: Alexander Potapenko R: Dmitry Vyukov L: kasan-dev@googlegroups.com From e82553c10b0899994153f9bf0af333c0a1550fd7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 9 Feb 2021 13:42:28 -0800 Subject: [PATCH 67/69] Revert "mm: memcontrol: avoid workload stalls when lowering memory.high" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 536d3bf261a2fc3b05b3e91e7eef7383443015cf, as it can cause writers to memory.high to get stuck in the kernel forever, performing page reclaim and consuming excessive amounts of CPU cycles. Before the patch, a write to memory.high would first put the new limit in place for the workload, and then reclaim the requested delta. After the patch, the kernel tries to reclaim the delta before putting the new limit into place, in order to not overwhelm the workload with a sudden, large excess over the limit. However, if reclaim is actively racing with new allocations from the uncurbed workload, it can keep the write() working inside the kernel indefinitely. This is causing problems in Facebook production. A privileged system-level daemon that adjusts memory.high for various workloads running on a host can get unexpectedly stuck in the kernel and essentially turn into a sort of involuntary kswapd for one of the workloads. We've observed that daemon busy-spin in a write() for minutes at a time, neglecting its other duties on the system, and expending privileged system resources on behalf of a workload. To remedy this, we have first considered changing the reclaim logic to break out after a couple of loops - whether the workload has converged to the new limit or not - and bound the write() call this way. However, the root cause that inspired the sequence change in the first place has been fixed through other means, and so a revert back to the proven limit-setting sequence, also used by memory.max, is preferable. The sequence was changed to avoid extreme latencies in the workload when the limit was lowered: the sudden, large excess created by the limit lowering would erroneously trigger the penalty sleeping code that is meant to throttle excessive growth from below. Allocating threads could end up sleeping long after the write() had already reclaimed the delta for which they were being punished. However, erroneous throttling also caused problems in other scenarios at around the same time. This resulted in commit b3ff92916af3 ("mm, memcg: reclaim more aggressively before high allocator throttling"), included in the same release as the offending commit. When allocating threads now encounter large excess caused by a racing write() to memory.high, instead of entering punitive sleeps, they will simply be tasked with helping reclaim down the excess, and will be held no longer than it takes to accomplish that. This is in line with regular limit enforcement - i.e. if the workload allocates up against or over an otherwise unchanged limit from below. With the patch breaking userspace, and the root cause addressed by other means already, revert it again. Link: https://lkml.kernel.org/r/20210122184341.292461-1-hannes@cmpxchg.org Fixes: 536d3bf261a2 ("mm: memcontrol: avoid workload stalls when lowering memory.high") Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Acked-by: Chris Down Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Michal Koutný Cc: [5.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2de77b5bcc2..913c2b9e5c72 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6271,6 +6271,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; + page_counter_set_high(&memcg->memory, high); + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -6294,10 +6296,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, break; } - page_counter_set_high(&memcg->memory, high); - memcg_wb_domain_size_changed(memcg); - return nbytes; } From 3286222fc609dea27bd16ac02c55d3f1c3190063 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 9 Feb 2021 13:42:32 -0800 Subject: [PATCH 68/69] mm, slub: better heuristic for number of cpus when calculating slab order When creating a new kmem cache, SLUB determines how large the slab pages will based on number of inputs, including the number of CPUs in the system. Larger slab pages mean that more objects can be allocated/free from per-cpu slabs before accessing shared structures, but also potentially more memory can be wasted due to low slab usage and fragmentation. The rough idea of using number of CPUs is that larger systems will be more likely to benefit from reduced contention, and also should have enough memory to spare. Number of CPUs used to be determined as nr_cpu_ids, which is number of possible cpus, but on some systems many will never be onlined, thus commit 045ab8c9487b ("mm/slub: let number of online CPUs determine the slub page order") changed it to nr_online_cpus(). However, for kmem caches created early before CPUs are onlined, this may lead to permamently low slab page sizes. Vincent reports a regression [1] of hackbench on arm64 systems: "I'm facing significant performances regression on a large arm64 server system (224 CPUs). Regressions is also present on small arm64 system (8 CPUs) but in a far smaller order of magnitude On 224 CPUs system : 9 iterations of hackbench -l 16000 -g 16 v5.11-rc4 : 9.135sec (+/- 0.45%) v5.11-rc4 + revert this patch: 3.173sec (+/- 0.48%) v5.10: 3.136sec (+/- 0.40%)" Mel reports a regression [2] of hackbench on x86_64, with lockstat suggesting page allocator contention: "i.e. the patch incurs a 7% to 32% performance penalty. This bisected cleanly yesterday when I was looking for the regression and then found the thread. Numerous caches change size. For example, kmalloc-512 goes from order-0 (vanilla) to order-2 with the revert. So mostly this is down to the number of times SLUB calls into the page allocator which only caches order-0 pages on a per-cpu basis" Clearly num_online_cpus() doesn't work too early in bootup. We could change the order dynamically in a memory hotplug callback, but runtime order changing for existing kmem caches has been already shown as dangerous, and removed in 32a6f409b693 ("mm, slub: remove runtime allocation order changes"). It could be resurrected in a safe manner with some effort, but to fix the regression we need something simpler. We could use num_present_cpus() that should be the number of physically present CPUs even before they are onlined. That would work for PowerPC [3], which triggered the original commit, but that still doesn't work on arm64 [4] as explained in [5]. So this patch tries to determine the best available value without specific arch knowledge. - num_present_cpus() if the number is larger than 1, as that means the arch is likely setting it properly - nr_cpu_ids otherwise This should fix the reported regressions while also keeping the effect of 045ab8c9487b for PowerPC systems. It's possible there are configurations where num_present_cpus() is 1 during boot while nr_cpu_ids is at the same time bloated, so these (if they exist) would keep the large orders based on nr_cpu_ids as was before 045ab8c9487b. [1] https://lore.kernel.org/linux-mm/CAKfTPtA_JgMf_+zdFbcb_V9rM7JBWNPjAz9irgwFj7Rou=xzZg@mail.gmail.com/ [2] https://lore.kernel.org/linux-mm/20210128134512.GF3592@techsingularity.net/ [3] https://lore.kernel.org/linux-mm/20210123051607.GC2587010@in.ibm.com/ [4] https://lore.kernel.org/linux-mm/CAKfTPtAjyVmS5VYvU6DBxg4-JEo5bdmWbngf-03YsY18cmWv_g@mail.gmail.com/ [5] https://lore.kernel.org/linux-mm/20210126230305.GD30941@willie-the-truck/ Link: https://lkml.kernel.org/r/20210208134108.22286-1-vbabka@suse.cz Fixes: 045ab8c9487b ("mm/slub: let number of online CPUs determine the slub page order") Signed-off-by: Vlastimil Babka Reported-by: Vincent Guittot Reported-by: Mel Gorman Tested-by: Mel Gorman Tested-by: Vincent Guittot Cc: Catalin Marinas Cc: Aneesh Kumar K.V Cc: Bharata B Rao Cc: Christoph Lameter Cc: Roman Gushchin Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Jann Horn Cc: Michal Hocko Cc: David Rientjes Cc: Shakeel Butt Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 7ecbbbe5bc0c..b22a4b101c84 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3423,6 +3423,7 @@ static inline int calculate_order(unsigned int size) unsigned int order; unsigned int min_objects; unsigned int max_objects; + unsigned int nr_cpus; /* * Attempt to find best configuration for a slab. This @@ -3433,8 +3434,21 @@ static inline int calculate_order(unsigned int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; - if (!min_objects) - min_objects = 4 * (fls(num_online_cpus()) + 1); + if (!min_objects) { + /* + * Some architectures will only update present cpus when + * onlining them, so don't trust the number if it's just 1. But + * we also don't want to use nr_cpu_ids always, as on some other + * architectures, there can be many possible cpus, but never + * onlined. Here we compromise between trying to avoid too high + * order on systems that appear larger than they are, and too + * low order on systems that appear smaller than they are. + */ + nr_cpus = num_present_cpus(); + if (nr_cpus <= 1) + nr_cpus = nr_cpu_ids; + min_objects = 4 * (fls(nr_cpus) + 1); + } max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); From a35d8f016e0b68634035217d06d1c53863456b50 Mon Sep 17 00:00:00 2001 From: Joachim Henke Date: Tue, 9 Feb 2021 13:42:36 -0800 Subject: [PATCH 69/69] nilfs2: make splice write available again Since 5.10, splice() or sendfile() to NILFS2 return EINVAL. This was caused by commit 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops"). This patch initializes the splice_write field in file_operations, like most file systems do, to restore the functionality. Link: https://lkml.kernel.org/r/1612784101-14353-1-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Joachim Henke Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: [5.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 64bc81363c6c..e1bd592ce700 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -141,6 +141,7 @@ const struct file_operations nilfs_file_operations = { /* .release = nilfs_release_file, */ .fsync = nilfs_sync_file, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; const struct inode_operations nilfs_file_inode_operations = {