From 4db7384ce55c4d7bfb9876fabd8d8778b2ff90ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 May 2025 14:03:01 -0400 Subject: [PATCH 001/297] btrfs: don't drop a reference if btrfs_check_write_meta_pointer() fails In the zoned mode there's a bug in the extent buffer tree conversion to xarray. The reference for eb is dropped and code continues but the references get dropped by releasing the batch. Reported-by: Johannes Thumshirn Fixes: 19d7f65f032f ("btrfs: convert the buffer_radix to an xarray") Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e43f6280f954..849199768664 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2189,7 +2189,6 @@ int btree_write_cache_pages(struct address_space *mapping, done = 1; break; } - free_extent_buffer(eb); continue; } From 400123bd0107175e92a9780b97f7a5934eb0a991 Mon Sep 17 00:00:00 2001 From: Andrej Picej Date: Thu, 29 May 2025 07:36:53 +0200 Subject: [PATCH 002/297] dt-bindings: drm/bridge: ti-sn65dsi83: drop $ref to fix lvds-vod* warnings The kernel test robot reported a warning related to the use of "$ref" type definitions for custom endpoint properties - "ti,lvds-vod-swing-clock-microvolt" and - "ti,lvds-vod-swing-data-microvolt". Using "$ref" with "uint32-array" is not correctly handled in this context. Removing "$ref" and relying solely on "maxItems: 2" enforces the intended requirement of specifying exactly two values, without triggering a schema validation warning. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505021937.efnQPPqx-lkp@intel.com/ Signed-off-by: Andrej Picej Link: https://lore.kernel.org/r/20250529053654.1754926-1-andrej.picej@norik.com Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/display/bridge/ti,sn65dsi83.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml index 9b5f3f3eab19..e69b6343a8eb 100644 --- a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml +++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml @@ -118,15 +118,11 @@ $defs: ti,lvds-vod-swing-clock-microvolt: description: LVDS diferential output voltage for clock lanes in microvolts. - $ref: /schemas/types.yaml#/definitions/uint32-array - minItems: 2 maxItems: 2 ti,lvds-vod-swing-data-microvolt: description: LVDS diferential output voltage for data lanes in microvolts. - $ref: /schemas/types.yaml#/definitions/uint32-array - minItems: 2 maxItems: 2 allOf: From d53fd59707c402d03ec740b4e90a2ddcb5312006 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 7 May 2025 16:59:02 -0500 Subject: [PATCH 003/297] dt-bindings: soc: fsl,ls1028a-reset: Drop extra "/" in $id The $id value has a double "//". Drop it. Fixes: 9ca5a7d9d2e0 ("dt-bindings: soc: fsl: Add fsl,ls1028a-reset for reset syscon node") Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250507215903.2748698-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/soc/fsl/fsl,ls1028a-reset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,ls1028a-reset.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,ls1028a-reset.yaml index 31295be91013..558a8c7aab9c 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,ls1028a-reset.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,ls1028a-reset.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas//soc/fsl/fsl,ls1028a-reset.yaml# +$id: http://devicetree.org/schemas/soc/fsl/fsl,ls1028a-reset.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Freescale Layerscape Reset Registers Module From 87b42c114cdda76c8ad3002f2096699ad5146cb3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 28 May 2025 11:11:41 +0300 Subject: [PATCH 004/297] cxl: fix return value in cxlctl_validate_set_features() The cxlctl_validate_set_features() function is type bool. It's supposed to return true for valid requests and false for invalid. However, this error path returns ERR_PTR(-EINVAL) which is true when it was intended to return false. The incorrect return will result in kernel failing to prevent a incorrect op_size passed in from userspace to be detected. [ dj: Add user impact to commit log ] Fixes: f76e0bbc8bc3 ("cxl: Update prototype of function get_support_feature_info()") Signed-off-by: Dan Carpenter Reviewed-by: Ira Weiny Link: https://patch.msgid.link/aDbFPSCujpJLY1if@stanley.mountain Signed-off-by: Dave Jiang --- drivers/cxl/core/features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/features.c b/drivers/cxl/core/features.c index 6f2eae1eb126..7c750599ea69 100644 --- a/drivers/cxl/core/features.c +++ b/drivers/cxl/core/features.c @@ -544,7 +544,7 @@ static bool cxlctl_validate_set_features(struct cxl_features_state *cxlfs, u32 flags; if (rpc_in->op_size < sizeof(uuid_t)) - return ERR_PTR(-EINVAL); + return false; feat = cxl_feature_info(cxlfs, &rpc_in->set_feat_in.uuid); if (IS_ERR(feat)) From 6dea74e454c260cd757b53a2f6861fffe6d83308 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 Jun 2025 01:26:54 +0100 Subject: [PATCH 005/297] f2fs: Fix __write_node_folio() conversion This conversion moved the folio_unlock() to inside __write_node_folio(), but missed one caller so we had a double-unlock on this path. Cc: Christoph Hellwig Cc: Chao Yu Cc: Jaegeuk Kim Reported-by: syzbot+c0dc46208750f063d0e0@syzkaller.appspotmail.com Fixes: 80f31d2a7e5f (f2fs: return bool from __write_node_folio) Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1cb4cba7f961..bfe104db284e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2078,7 +2078,6 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { - folio_unlock(folio); folio_batch_release(&fbatch); ret = -EIO; goto out; From 5ae416c5b1e2e816aee7b3fc8347adf70afabb4c Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Fri, 6 Jun 2025 19:49:57 +0100 Subject: [PATCH 006/297] HID: wacom: fix memory leak on kobject creation failure During wacom_initialize_remotes() a fifo buffer is allocated with kfifo_alloc() and later a cleanup action is registered during devm_add_action_or_reset() to clean it up. However if the code fails to create a kobject and register it with sysfs the code simply returns -ENOMEM before the cleanup action is registered leading to a memory leak. Fix this by ensuring the fifo is freed when the kobject creation and registration process fails. Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated") Reviewed-by: Ping Cheng Cc: stable@vger.kernel.org Signed-off-by: Qasim Ijaz Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index eaf099b2efdb..ec5282bc69d6 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2048,8 +2048,10 @@ static int wacom_initialize_remotes(struct wacom *wacom) remote->remote_dir = kobject_create_and_add("wacom_remote", &wacom->hdev->dev.kobj); - if (!remote->remote_dir) + if (!remote->remote_dir) { + kfifo_free(&remote->remote_fifo); return -ENOMEM; + } error = sysfs_create_files(remote->remote_dir, remote_unpair_attrs); From 1a19ae437ca5d5c7d9ec2678946fb339b1c706bf Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Fri, 6 Jun 2025 19:49:58 +0100 Subject: [PATCH 007/297] HID: wacom: fix memory leak on sysfs attribute creation failure When sysfs_create_files() fails during wacom_initialize_remotes() the fifo buffer is not freed leading to a memory leak. Fix this by calling kfifo_free() before returning. Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated") Reviewed-by: Ping Cheng Cc: stable@vger.kernel.org Signed-off-by: Qasim Ijaz Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index ec5282bc69d6..58cbd43a37e9 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2058,6 +2058,7 @@ static int wacom_initialize_remotes(struct wacom *wacom) if (error) { hid_err(wacom->hdev, "cannot create sysfs group err: %d\n", error); + kfifo_free(&remote->remote_fifo); return error; } From 85a720f4337f0ddf1603c8b75a8f1ffbbe022ef9 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Fri, 6 Jun 2025 19:49:59 +0100 Subject: [PATCH 008/297] HID: wacom: fix kobject reference count leak When sysfs_create_files() fails in wacom_initialize_remotes() the error is returned and the cleanup action will not have been registered yet. As a result the kobject???s refcount is never dropped, so the kobject can never be freed leading to a reference leak. Fix this by calling kobject_put() before returning. Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated") Acked-by: Ping Cheng Cc: stable@vger.kernel.org Signed-off-by: Qasim Ijaz Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 58cbd43a37e9..1257131b1e34 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2059,6 +2059,7 @@ static int wacom_initialize_remotes(struct wacom *wacom) hid_err(wacom->hdev, "cannot create sysfs group err: %d\n", error); kfifo_free(&remote->remote_fifo); + kobject_put(remote->remote_dir); return error; } From 4a0381080397e77792a5168069f174d3e56175ff Mon Sep 17 00:00:00 2001 From: "Daniel J. Ogorchock" Date: Tue, 13 May 2025 03:47:00 -0400 Subject: [PATCH 009/297] HID: nintendo: avoid bluetooth suspend/resume stalls Ensure we don't stall or panic the kernel when using bluetooth-connected controllers. This was reported as an issue on android devices using kernel 6.6 due to the resume hook which had been added for usb joycons. First, set a new state value to JOYCON_CTLR_STATE_SUSPENDED in a newly-added nintendo_hid_suspend. This makes sure we will not stall out the kernel waiting for input reports during led classdev suspend. The stalls could happen if connectivity is unreliable or lost to the controller prior to suspend. Second, since we lose connectivity during suspend, do not try joycon_init() for bluetooth controllers in the nintendo_hid_resume path. Tested via multiple suspend/resume flows when using the controller both in USB and bluetooth modes. Signed-off-by: Daniel J. Ogorchock Reviewed-by: Silvan Jegen Signed-off-by: Jiri Kosina --- drivers/hid/hid-nintendo.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c index 839d5bcd72b1..fb4985988615 100644 --- a/drivers/hid/hid-nintendo.c +++ b/drivers/hid/hid-nintendo.c @@ -308,6 +308,7 @@ enum joycon_ctlr_state { JOYCON_CTLR_STATE_INIT, JOYCON_CTLR_STATE_READ, JOYCON_CTLR_STATE_REMOVED, + JOYCON_CTLR_STATE_SUSPENDED, }; /* Controller type received as part of device info */ @@ -2750,14 +2751,46 @@ static void nintendo_hid_remove(struct hid_device *hdev) static int nintendo_hid_resume(struct hid_device *hdev) { - int ret = joycon_init(hdev); + struct joycon_ctlr *ctlr = hid_get_drvdata(hdev); + int ret; + hid_dbg(hdev, "resume\n"); + if (!joycon_using_usb(ctlr)) { + hid_dbg(hdev, "no-op resume for bt ctlr\n"); + ctlr->ctlr_state = JOYCON_CTLR_STATE_READ; + return 0; + } + + ret = joycon_init(hdev); if (ret) - hid_err(hdev, "Failed to restore controller after resume"); + hid_err(hdev, + "Failed to restore controller after resume: %d\n", + ret); + else + ctlr->ctlr_state = JOYCON_CTLR_STATE_READ; return ret; } +static int nintendo_hid_suspend(struct hid_device *hdev, pm_message_t message) +{ + struct joycon_ctlr *ctlr = hid_get_drvdata(hdev); + + hid_dbg(hdev, "suspend: %d\n", message.event); + /* + * Avoid any blocking loops in suspend/resume transitions. + * + * joycon_enforce_subcmd_rate() can result in repeated retries if for + * whatever reason the controller stops providing input reports. + * + * This has been observed with bluetooth controllers which lose + * connectivity prior to suspend (but not long enough to result in + * complete disconnection). + */ + ctlr->ctlr_state = JOYCON_CTLR_STATE_SUSPENDED; + return 0; +} + #endif static const struct hid_device_id nintendo_hid_devices[] = { @@ -2796,6 +2829,7 @@ static struct hid_driver nintendo_hid_driver = { #ifdef CONFIG_PM .resume = nintendo_hid_resume, + .suspend = nintendo_hid_suspend, #endif }; static int __init nintendo_init(void) From 73f3a7415d93cf418c7625d03bce72da84344406 Mon Sep 17 00:00:00 2001 From: Even Xu Date: Wed, 14 May 2025 14:26:38 +0800 Subject: [PATCH 010/297] HID: Intel-thc-hid: Intel-quicki2c: Enhance QuickI2C reset flow During customer board enabling, it was found: some touch devices prepared reset response, but either forgot sending interrupt or THC missed reset interrupt because of timing issue. THC QuickI2C driver depends on interrupt to read reset response, in this case, it will cause driver waiting timeout. This patch enhances the flow by adding manually reset response reading after waiting for reset interrupt timeout. Signed-off-by: Even Xu Tested-by: Chong Han Fixes: 66b59bfce6d9 ("HID: intel-thc-hid: intel-quicki2c: Complete THC QuickI2C driver") Signed-off-by: Jiri Kosina --- .../intel-quicki2c/quicki2c-protocol.c | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-protocol.c b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-protocol.c index f493df0d5dc4..a63f8c833252 100644 --- a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-protocol.c +++ b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-protocol.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "intel-thc-dev.h" #include "intel-thc-dma.h" @@ -200,6 +201,9 @@ int quicki2c_set_report(struct quicki2c_device *qcdev, u8 report_type, int quicki2c_reset(struct quicki2c_device *qcdev) { + u16 input_reg = le16_to_cpu(qcdev->dev_desc.input_reg); + size_t read_len = HIDI2C_LENGTH_LEN; + u32 prd_len = read_len; int ret; qcdev->reset_ack = false; @@ -213,12 +217,32 @@ int quicki2c_reset(struct quicki2c_device *qcdev) ret = wait_event_interruptible_timeout(qcdev->reset_ack_wq, qcdev->reset_ack, HIDI2C_RESET_TIMEOUT * HZ); - if (ret <= 0 || !qcdev->reset_ack) { + if (qcdev->reset_ack) + return 0; + + /* + * Manually read reset response if it wasn't received, in case reset interrupt + * was missed by touch device or THC hardware. + */ + ret = thc_tic_pio_read(qcdev->thc_hw, input_reg, read_len, &prd_len, + (u32 *)qcdev->input_buf); + if (ret) { + dev_err_once(qcdev->dev, "Read Reset Response failed, ret %d\n", ret); + return ret; + } + + /* + * Check response packet length, it's first 16 bits of packet. + * If response packet length is zero, it's reset response, otherwise not. + */ + if (get_unaligned_le16(qcdev->input_buf)) { dev_err_once(qcdev->dev, "Wait reset response timed out ret:%d timeout:%ds\n", ret, HIDI2C_RESET_TIMEOUT); return -ETIMEDOUT; } + qcdev->reset_ack = true; + return 0; } From 54bae4c17c11688339eb73a04fd24203bb6e7494 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Tue, 6 May 2025 13:50:15 +0800 Subject: [PATCH 011/297] HID: quirks: Add quirk for 2 Chicony Electronics HP 5MP Cameras The Chicony Electronics HP 5MP Cameras (USB ID 04F2:B824 & 04F2:B82C) report a HID sensor interface that is not actually implemented. Attempting to access this non-functional sensor via iio_info causes system hangs as runtime PM tries to wake up an unresponsive sensor. Add these 2 devices to the HID ignore list since the sensor interface is non-functional by design and should not be exposed to userspace. Signed-off-by: Chia-Lin Kao (AceLan) Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 2 ++ drivers/hid/hid-quirks.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index e3fb4e2fe911..f1e00641e26e 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -312,6 +312,8 @@ #define USB_DEVICE_ID_ASUS_AK1D 0x1125 #define USB_DEVICE_ID_CHICONY_TOSHIBA_WT10A 0x1408 #define USB_DEVICE_ID_CHICONY_ACER_SWITCH12 0x1421 +#define USB_DEVICE_ID_CHICONY_HP_5MP_CAMERA 0xb824 +#define USB_DEVICE_ID_CHICONY_HP_5MP_CAMERA2 0xb82c #define USB_VENDOR_ID_CHUNGHWAT 0x2247 #define USB_DEVICE_ID_CHUNGHWAT_MULTITOUCH 0x0001 diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 7fefeb413ec3..96f3c712acc8 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -757,6 +757,8 @@ static const struct hid_device_id hid_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_AVERMEDIA, USB_DEVICE_ID_AVER_FM_MR800) }, { HID_USB_DEVICE(USB_VENDOR_ID_AXENTIA, USB_DEVICE_ID_AXENTIA_FM_RADIO) }, { HID_USB_DEVICE(USB_VENDOR_ID_BERKSHIRE, USB_DEVICE_ID_BERKSHIRE_PCWD) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_HP_5MP_CAMERA) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_HP_5MP_CAMERA2) }, { HID_USB_DEVICE(USB_VENDOR_ID_CIDC, 0x0103) }, { HID_USB_DEVICE(USB_VENDOR_ID_CYGNAL, USB_DEVICE_ID_CYGNAL_RADIO_SI470X) }, { HID_USB_DEVICE(USB_VENDOR_ID_CYGNAL, USB_DEVICE_ID_CYGNAL_RADIO_SI4713) }, From fa10d4515817274a50af510d5d283d3c7fffc1ae Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 23 May 2025 11:10:07 -0500 Subject: [PATCH 012/297] HID: input: lower message severity of 'No inputs registered, leaving' to debug Plugging in a "Blue snowball" microphone always shows the error 'No inputs registered, leaving', but the device functions as intended. When a HID device is started using the function hid_hw_start() and the argument HID_CONNECT_DEFAULT it will try all various hid connect requests. Not all devices will create an input device and so the message is needlessly noisy. Decrease it to debug instead. [jkosina@suse.com: edit shortlog] Signed-off-by: Mario Limonciello Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 9d80635a91eb..ff1784b5c2a4 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -2343,7 +2343,7 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) } if (list_empty(&hid->inputs)) { - hid_err(hid, "No inputs registered, leaving\n"); + hid_dbg(hid, "No inputs registered, leaving\n"); goto out_unwind; } From 1a8953f4f7746c6a515989774fe03047c522c613 Mon Sep 17 00:00:00 2001 From: Zhang Heng Date: Thu, 5 Jun 2025 15:29:59 +0800 Subject: [PATCH 013/297] HID: Add IGNORE quirk for SMARTLINKTECHNOLOGY MARTLINKTECHNOLOGY is a microphone device, when the HID interface in an audio device is requested to get specific report id, the following error may occur. [ 562.939373] usb 1-1.4.1.2: new full-speed USB device number 21 using xhci_hcd [ 563.104908] usb 1-1.4.1.2: New USB device found, idVendor=4c4a, idProduct=4155, bcdDevice= 1.00 [ 563.104910] usb 1-1.4.1.2: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 563.104911] usb 1-1.4.1.2: Product: USB Composite Device [ 563.104912] usb 1-1.4.1.2: Manufacturer: SmartlinkTechnology [ 563.104913] usb 1-1.4.1.2: SerialNumber: 20201111000001 [ 563.229499] input: SmartlinkTechnology USB Composite Device as /devices/pci0000:00/0000:00:07.1/0000:04:00.3/usb1/1-1/1-1.4/1-1.4.1/1-1.4.1.2/1-1.4.1.2:1.2/0003:4C4A:4155.000F/input/input35 [ 563.291505] hid-generic 0003:4C4A:4155.000F: input,hidraw2: USB HID v2.01 Keyboard [SmartlinkTechnology USB Composite Device] on usb-0000:04:00.3-1.4.1.2/input2 [ 563.291557] usbhid 1-1.4.1.2:1.3: couldn't find an input interrupt endpoint [ 568.506654] usb 1-1.4.1.2: 1:1: usb_set_interface failed (-110) [ 573.626656] usb 1-1.4.1.2: 1:1: usb_set_interface failed (-110) [ 578.746657] usb 1-1.4.1.2: 1:1: usb_set_interface failed (-110) [ 583.866655] usb 1-1.4.1.2: 1:1: usb_set_interface failed (-110) [ 588.986657] usb 1-1.4.1.2: 1:1: usb_set_interface failed (-110) Ignore HID interface. The device is working properly. Signed-off-by: Zhang Heng Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 3 +++ drivers/hid/hid-quirks.c | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index f1e00641e26e..2d3769405ec3 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -1527,4 +1527,7 @@ #define USB_VENDOR_ID_SIGNOTEC 0x2133 #define USB_DEVICE_ID_SIGNOTEC_VIEWSONIC_PD1011 0x0018 +#define USB_VENDOR_ID_SMARTLINKTECHNOLOGY 0x4c4a +#define USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155 0x4155 + #endif diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 96f3c712acc8..31508da93ba2 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -906,6 +906,7 @@ static const struct hid_device_id hid_ignore_list[] = { #endif { HID_USB_DEVICE(USB_VENDOR_ID_YEALINK, USB_DEVICE_ID_YEALINK_P1K_P4K_B2K) }, { HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473) }, + { HID_USB_DEVICE(USB_VENDOR_ID_SMARTLINKTECHNOLOGY, USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155) }, { } }; From 9327e3ee5b077c4ab4495a09b67624f670ed88b6 Mon Sep 17 00:00:00 2001 From: Iusico Maxim Date: Thu, 5 Jun 2025 19:55:50 +0200 Subject: [PATCH 014/297] HID: lenovo: Restrict F7/9/11 mode to compact keyboards only Commit 2f2bd7cbd1d1 ("hid: lenovo: Resend all settings on reset_resume for compact keyboards") introduced a regression for ThinkPad TrackPoint Keyboard II by removing the conditional check for enabling F7/9/11 mode needed for compact keyboards only. As a result, the non-compact keyboards can no longer toggle Fn-lock via Fn+Esc, although it can be controlled via sysfs knob that directly sends raw commands. This patch restores the previous conditional check without any additions. Cc: stable@vger.kernel.org Fixes: 2f2bd7cbd1d1 ("hid: lenovo: Resend all settings on reset_resume for compact keyboards") Signed-off-by: Iusico Maxim Signed-off-by: Jiri Kosina --- drivers/hid/hid-lenovo.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index af29ba840522..a3c23a72316a 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -548,11 +548,14 @@ static void lenovo_features_set_cptkbd(struct hid_device *hdev) /* * Tell the keyboard a driver understands it, and turn F7, F9, F11 into - * regular keys + * regular keys (Compact only) */ - ret = lenovo_send_cmd_cptkbd(hdev, 0x01, 0x03); - if (ret) - hid_warn(hdev, "Failed to switch F7/9/11 mode: %d\n", ret); + if (hdev->product == USB_DEVICE_ID_LENOVO_CUSBKBD || + hdev->product == USB_DEVICE_ID_LENOVO_CBTKBD) { + ret = lenovo_send_cmd_cptkbd(hdev, 0x01, 0x03); + if (ret) + hid_warn(hdev, "Failed to switch F7/9/11 mode: %d\n", ret); + } /* Switch middle button to native mode */ ret = lenovo_send_cmd_cptkbd(hdev, 0x09, 0x01); From 0e97f5b6a0808fa2ea865280708511c817d4bca3 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Tue, 10 Jun 2025 10:01:31 +0800 Subject: [PATCH 015/297] hid: intel-ish-hid: Use PCI_DEVICE_DATA() macro for ISH device table Replace the usage of PCI_VDEVICE() with driver_data assignment in the ISH PCI device table with the PCI_DEVICE_DATA() macro. This improves code readability. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/pci-ish.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index ff0fc8010072..0db41ed74a14 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -67,9 +67,9 @@ static const struct pci_device_id ish_pci_tbl[] = { {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_MTL_P)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_ARL_H)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_ARL_S)}, - {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_LNL_M), .driver_data = ISHTP_DRIVER_DATA_LNL_M}, - {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_PTL_H), .driver_data = ISHTP_DRIVER_DATA_PTL}, - {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_PTL_P), .driver_data = ISHTP_DRIVER_DATA_PTL}, + {PCI_DEVICE_DATA(INTEL, ISH_LNL_M, ISHTP_DRIVER_DATA_LNL_M)}, + {PCI_DEVICE_DATA(INTEL, ISH_PTL_H, ISHTP_DRIVER_DATA_PTL)}, + {PCI_DEVICE_DATA(INTEL, ISH_PTL_P, ISHTP_DRIVER_DATA_PTL)}, {} }; MODULE_DEVICE_TABLE(pci, ish_pci_tbl); From 5cdb49a680b45f467e9d915c0e74756bc0c67c57 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Tue, 10 Jun 2025 10:01:32 +0800 Subject: [PATCH 016/297] HID: intel-ish-hid: ipc: Add Wildcat Lake PCI device ID Add device ID of Wildcat Lake into ishtp support list. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/hw-ish.h | 1 + drivers/hid/intel-ish-hid/ipc/pci-ish.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index 07e90d51f073..fa5d68c36313 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -38,6 +38,7 @@ #define PCI_DEVICE_ID_INTEL_ISH_LNL_M 0xA845 #define PCI_DEVICE_ID_INTEL_ISH_PTL_H 0xE345 #define PCI_DEVICE_ID_INTEL_ISH_PTL_P 0xE445 +#define PCI_DEVICE_ID_INTEL_ISH_WCL 0x4D45 #define REVISION_ID_CHT_A0 0x6 #define REVISION_ID_CHT_Ax_SI 0x0 diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 0db41ed74a14..c57483224db6 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -27,10 +27,12 @@ enum ishtp_driver_data_index { ISHTP_DRIVER_DATA_NONE, ISHTP_DRIVER_DATA_LNL_M, ISHTP_DRIVER_DATA_PTL, + ISHTP_DRIVER_DATA_WCL, }; #define ISH_FW_GEN_LNL_M "lnlm" #define ISH_FW_GEN_PTL "ptl" +#define ISH_FW_GEN_WCL "wcl" #define ISH_FIRMWARE_PATH(gen) "intel/ish/ish_" gen ".bin" #define ISH_FIRMWARE_PATH_ALL "intel/ish/ish_*.bin" @@ -42,6 +44,9 @@ static struct ishtp_driver_data ishtp_driver_data[] = { [ISHTP_DRIVER_DATA_PTL] = { .fw_generation = ISH_FW_GEN_PTL, }, + [ISHTP_DRIVER_DATA_WCL] = { + .fw_generation = ISH_FW_GEN_WCL, + }, }; static const struct pci_device_id ish_pci_tbl[] = { @@ -70,6 +75,7 @@ static const struct pci_device_id ish_pci_tbl[] = { {PCI_DEVICE_DATA(INTEL, ISH_LNL_M, ISHTP_DRIVER_DATA_LNL_M)}, {PCI_DEVICE_DATA(INTEL, ISH_PTL_H, ISHTP_DRIVER_DATA_PTL)}, {PCI_DEVICE_DATA(INTEL, ISH_PTL_P, ISHTP_DRIVER_DATA_PTL)}, + {PCI_DEVICE_DATA(INTEL, ISH_WCL, ISHTP_DRIVER_DATA_WCL)}, {} }; MODULE_DEVICE_TABLE(pci, ish_pci_tbl); From e0eb1b6b0cd29ca7793c501d5960fd36ba11f110 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 2 Jun 2025 20:48:44 -0700 Subject: [PATCH 017/297] riscv: vdso: Exclude .rodata from the PT_DYNAMIC segment .rodata is implicitly included in the PT_DYNAMIC segment due to inheriting the segment of the preceding .dynamic section (in both GNU ld and LLD). When the .rodata section's size is not a multiple of 16 bytes on riscv64, llvm-readelf will report a "PT_DYNAMIC dynamic table is invalid" warning. Note: in the presence of the .dynamic section, GNU readelf and llvm-readelf's -d option decodes the dynamic section using the section. This issue arose after commit 8f8c1ff879fab60f80f3a7aec3000f47e5b03ba9 ("riscv: vdso.lds.S: remove hardcoded 0x800 .text start addr"), which placed .rodata directly after .dynamic by removing .eh_frame. This patch resolves the implicit inclusion into PT_DYNAMIC by explicitly specifying the :text output section phdr. Reported-by: Nathan Chancellor Closes: https://github.com/ClangBuiltLinux/linux/issues/2093 Signed-off-by: Fangrui Song Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250602-riscv-vdso-v1-1-0620cf63cff0@maskray.me Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/vdso.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 7c15b0f4ee3b..c29ef12a63bb 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -30,7 +30,7 @@ SECTIONS *(.data .data.* .gnu.linkonce.d.*) *(.dynbss) *(.bss .bss.* .gnu.linkonce.b.*) - } + } :text .note : { *(.note.*) } :text :note From fdc9be90929081ed5a9658583aa5f3121d5e8365 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Tue, 3 Jun 2025 18:43:13 +0800 Subject: [PATCH 018/297] cxl/edac: Fix the min_scrub_cycle of a region miscalculation When trying to update the scrub_cycle value of a cxl region, which means updating the scrub_cycle value of each memdev under a cxl region. cxl driver needs to guarantee the new scrub_cycle value is greater than the min_scrub_cycle value of a memdev, otherwise the updating operation will fail(Per Table 8-223 in CXL r3.2 section 8.2.10.9.11.1). Current implementation logic of getting the min_scrub_cycle value of a cxl region is that getting the min_scrub_cycle value of each memdevs under the cxl region, then using the minimum min_scrub_cycle value as the region's min_scrub_cycle. Checking if the new scrub_cycle value is greater than this value. If yes, updating the new scrub_cycle value to each memdevs. The issue is that the new scrub_cycle value is possibly greater than the minimum min_scrub_cycle value of all memdevs but less than the maximum min_scrub_cycle value of all memdevs if memdevs have a different min_scrub_cycle value. The updating operation will always fail on these memdevs which have a greater min_scrub_cycle than the new scrub_cycle. The correct implementation logic is to get the maximum value of these memdevs' min_scrub_cycle, check if the new scrub_cycle value is greater than the value. If yes, the new scrub_cycle value is fit for the region. The change also impacts the result of cxl_patrol_scrub_get_min_scrub_cycle(), the interface returned the minimum min_scrub_cycle value among all memdevs under the region before the change. The interface will return the maximum min_scrub_cycle value among all memdevs under the region with the change. Signed-off-by: Li Ming Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Shiju Jose Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/20250603104314.25569-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/edac.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c index 2cbc664e5d62..0ef245d0bd9f 100644 --- a/drivers/cxl/core/edac.c +++ b/drivers/cxl/core/edac.c @@ -103,10 +103,10 @@ static int cxl_scrub_get_attrbs(struct cxl_patrol_scrub_context *cxl_ps_ctx, u8 *cap, u16 *cycle, u8 *flags, u8 *min_cycle) { struct cxl_mailbox *cxl_mbox; - u8 min_scrub_cycle = U8_MAX; struct cxl_region_params *p; struct cxl_memdev *cxlmd; struct cxl_region *cxlr; + u8 min_scrub_cycle = 0; int i, ret; if (!cxl_ps_ctx->cxlr) { @@ -133,8 +133,12 @@ static int cxl_scrub_get_attrbs(struct cxl_patrol_scrub_context *cxl_ps_ctx, if (ret) return ret; + /* + * The min_scrub_cycle of a region is the max of minimum scrub + * cycles supported by memdevs that back the region. + */ if (min_cycle) - min_scrub_cycle = min(*min_cycle, min_scrub_cycle); + min_scrub_cycle = max(*min_cycle, min_scrub_cycle); } if (min_cycle) From 85cc50bfcb8b08c9304925b66cd2bc83c1c765bf Mon Sep 17 00:00:00 2001 From: Li Ming Date: Tue, 3 Jun 2025 18:43:14 +0800 Subject: [PATCH 019/297] cxl/Documentation: Add more description about min/max scrub cycle user can configurare scrub cycle for a region or a memory device via sysfs interface. Currently, these interfaces have not enough description for the return value. So adding return value description to these interfaces. Suggested-by: Alison Schofield Signed-off-by: Shiju Jose Signed-off-by: Li Ming Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/20250603104314.25569-2-ming.li@zohomail.com Signed-off-by: Dave Jiang --- Documentation/ABI/testing/sysfs-edac-scrub | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-edac-scrub b/Documentation/ABI/testing/sysfs-edac-scrub index c43be90deab4..ab6014743da5 100644 --- a/Documentation/ABI/testing/sysfs-edac-scrub +++ b/Documentation/ABI/testing/sysfs-edac-scrub @@ -49,6 +49,12 @@ Description: (RO) Supported minimum scrub cycle duration in seconds by the memory scrubber. + Device-based scrub: returns the minimum scrub cycle + supported by the memory device. + + Region-based scrub: returns the max of minimum scrub cycles + supported by individual memory devices that back the region. + What: /sys/bus/edac/devices//scrubX/max_cycle_duration Date: March 2025 KernelVersion: 6.15 @@ -57,6 +63,16 @@ Description: (RO) Supported maximum scrub cycle duration in seconds by the memory scrubber. + Device-based scrub: returns the maximum scrub cycle supported + by the memory device. + + Region-based scrub: returns the min of maximum scrub cycles + supported by individual memory devices that back the region. + + If the memory device does not provide maximum scrub cycle + information, return the maximum supported value of the scrub + cycle field. + What: /sys/bus/edac/devices//scrubX/current_cycle_duration Date: March 2025 KernelVersion: 6.15 From f3054152c12e2eed1e72704aff47b0ea58229584 Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Mon, 19 May 2025 10:54:46 +0200 Subject: [PATCH 020/297] HID: wacom: fix crash in wacom_aes_battery_handler() Commit fd2a9b29dc9c ("HID: wacom: Remove AES power_supply after extended inactivity") introduced wacom_aes_battery_handler() which is scheduled as a delayed work (aes_battery_work). In wacom_remove(), aes_battery_work is not canceled. Consequently, if the device is removed while aes_battery_work is still pending, then hard crashes or "Oops: general protection fault..." are experienced when wacom_aes_battery_handler() is finally called. E.g., this happens with built-in USB devices after resume from hibernate when aes_battery_work was still pending at the time of hibernation. So, take care to cancel aes_battery_work in wacom_remove(). Fixes: fd2a9b29dc9c ("HID: wacom: Remove AES power_supply after extended inactivity") Signed-off-by: Thomas Zeitlhofer Acked-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 1257131b1e34..9a57504e51a1 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2905,6 +2905,7 @@ static void wacom_remove(struct hid_device *hdev) hid_hw_stop(hdev); cancel_delayed_work_sync(&wacom->init_work); + cancel_delayed_work_sync(&wacom->aes_battery_work); cancel_work_sync(&wacom->wireless_work); cancel_work_sync(&wacom->battery_work); cancel_work_sync(&wacom->remote_work); From 8d90d9872edae7e78c3a12b98e239bfaa66f3639 Mon Sep 17 00:00:00 2001 From: Charles Mirabile Date: Fri, 30 May 2025 17:14:22 -0400 Subject: [PATCH 021/297] riscv: fix runtime constant support for nommu kernels the `__runtime_fixup_32` function does not handle the case where `val` is zero correctly (as might occur when patching a nommu kernel and referring to a physical address below the 4GiB boundary whose upper 32 bits are all zero) because nothing in the existing logic prevents the code from taking the `else` branch of both nop-checks and emitting two `nop` instructions. This leaves random garbage in the register that is supposed to receive the upper 32 bits of the pointer instead of zero that when combined with the value for the lower 32 bits yields an invalid pointer and causes a kernel panic when that pointer is eventually accessed. The author clearly considered the fact that if the `lui` is converted into a `nop` that the second instruction needs to be adjusted to become an `li` instead of an `addi`, hence introducing the `addi_insn_mask` variable, but didn't follow that logic through fully to the case where the `else` branch executes. To fix it just adjust the logic to ensure that the second `else` branch is not taken if the first instruction will be patched to a `nop`. Fixes: a44fb5722199 ("riscv: Add runtime constant support") Signed-off-by: Charles Mirabile Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Link: https://lore.kernel.org/r/20250530211422.784415-2-cmirabil@redhat.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/runtime-const.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/runtime-const.h b/arch/riscv/include/asm/runtime-const.h index 451fd76b8811..d766e2b9e6df 100644 --- a/arch/riscv/include/asm/runtime-const.h +++ b/arch/riscv/include/asm/runtime-const.h @@ -206,7 +206,7 @@ static inline void __runtime_fixup_32(__le16 *lui_parcel, __le16 *addi_parcel, u addi_insn_mask &= 0x07fff; } - if (lower_immediate & 0x00000fff) { + if (lower_immediate & 0x00000fff || lui_insn == RISCV_INSN_NOP4) { /* replace upper 12 bits of addi with lower 12 bits of val */ addi_insn &= addi_insn_mask; addi_insn |= (lower_immediate & 0x00000fff) << 20; From ed2a6ff0234e21cd5e56b63d8bff80120bbe5f15 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 1 May 2025 20:14:26 -0500 Subject: [PATCH 022/297] dt-bindings: serial: Convert altr,juart-1.0 to DT schema Convert the Altera JTAG UART binding to DT schema. The "ALTR,juart-1.0" compatible has long been deprecated, so drop it. Signed-off-by: Rob Herring (Arm) --- .../bindings/serial/altera_jtaguart.txt | 5 ----- .../bindings/serial/altr,juart-1.0.yaml | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) delete mode 100644 Documentation/devicetree/bindings/serial/altera_jtaguart.txt create mode 100644 Documentation/devicetree/bindings/serial/altr,juart-1.0.yaml diff --git a/Documentation/devicetree/bindings/serial/altera_jtaguart.txt b/Documentation/devicetree/bindings/serial/altera_jtaguart.txt deleted file mode 100644 index 55a901051e8f..000000000000 --- a/Documentation/devicetree/bindings/serial/altera_jtaguart.txt +++ /dev/null @@ -1,5 +0,0 @@ -Altera JTAG UART - -Required properties: -- compatible : should be "ALTR,juart-1.0" -- compatible : should be "altr,juart-1.0" diff --git a/Documentation/devicetree/bindings/serial/altr,juart-1.0.yaml b/Documentation/devicetree/bindings/serial/altr,juart-1.0.yaml new file mode 100644 index 000000000000..02e20fa591da --- /dev/null +++ b/Documentation/devicetree/bindings/serial/altr,juart-1.0.yaml @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/serial/altr,juart-1.0.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Altera JTAG UART + +maintainers: + - Dinh Nguyen + +properties: + compatible: + const: altr,juart-1.0 + +required: + - compatible + +additionalProperties: false From f75794b6077ec729f57de9a1ad24f14d288a68bb Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 1 May 2025 20:15:40 -0500 Subject: [PATCH 023/297] dt-bindings: serial: Convert altr,uart-1.0 to DT schema Convert the Altera JTAG UART binding to DT schema. The "ALTR,uart-1.0" compatible has long been deprecated, so drop it. Signed-off-by: Rob Herring (Arm) --- .../bindings/serial/altera_uart.txt | 8 ------ .../bindings/serial/altr,uart-1.0.yaml | 25 +++++++++++++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) delete mode 100644 Documentation/devicetree/bindings/serial/altera_uart.txt create mode 100644 Documentation/devicetree/bindings/serial/altr,uart-1.0.yaml diff --git a/Documentation/devicetree/bindings/serial/altera_uart.txt b/Documentation/devicetree/bindings/serial/altera_uart.txt deleted file mode 100644 index 81bf7ffb1a81..000000000000 --- a/Documentation/devicetree/bindings/serial/altera_uart.txt +++ /dev/null @@ -1,8 +0,0 @@ -Altera UART - -Required properties: -- compatible : should be "ALTR,uart-1.0" -- compatible : should be "altr,uart-1.0" - -Optional properties: -- clock-frequency : frequency of the clock input to the UART diff --git a/Documentation/devicetree/bindings/serial/altr,uart-1.0.yaml b/Documentation/devicetree/bindings/serial/altr,uart-1.0.yaml new file mode 100644 index 000000000000..72d4972e1e22 --- /dev/null +++ b/Documentation/devicetree/bindings/serial/altr,uart-1.0.yaml @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/serial/altr,uart-1.0.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Altera UART + +maintainers: + - Dinh Nguyen + +allOf: + - $ref: /schemas/serial/serial.yaml# + +properties: + compatible: + const: altr,uart-1.0 + + clock-frequency: + description: Frequency of the clock input to the UART. + +required: + - compatible + +unevaluatedProperties: false From 4262bd0d9cc704ea1365ac00afc1272400c2cbef Mon Sep 17 00:00:00 2001 From: Han Gao Date: Fri, 23 May 2025 18:25:56 +0800 Subject: [PATCH 024/297] riscv: vector: Fix context save/restore with xtheadvector Previously only v0-v7 were correctly saved/restored, and the context of v8-v31 are damanged. Correctly save/restore v8-v31 to avoid breaking userspace. Fixes: d863910eabaf ("riscv: vector: Support xtheadvector save/restore") Cc: stable@vger.kernel.org Signed-off-by: Han Gao Tested-by: Xiongchuan Tan Reviewed-by: Charlie Jenkins Reviewed-by: Yanteng Si Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/9b9eb2337f3d5336ce813721f8ebea51e0b2b553.1747994822.git.rabenda.cn@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vector.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 45c9b426fcc5..b61786d43c20 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -205,11 +205,11 @@ static inline void __riscv_v_vstate_save(struct __riscv_v_ext_state *save_to, THEAD_VSETVLI_T4X0E8M8D1 THEAD_VSB_V_V0T0 "add t0, t0, t4\n\t" - THEAD_VSB_V_V0T0 + THEAD_VSB_V_V8T0 "add t0, t0, t4\n\t" - THEAD_VSB_V_V0T0 + THEAD_VSB_V_V16T0 "add t0, t0, t4\n\t" - THEAD_VSB_V_V0T0 + THEAD_VSB_V_V24T0 : : "r" (datap) : "memory", "t0", "t4"); } else { asm volatile ( @@ -241,11 +241,11 @@ static inline void __riscv_v_vstate_restore(struct __riscv_v_ext_state *restore_ THEAD_VSETVLI_T4X0E8M8D1 THEAD_VLB_V_V0T0 "add t0, t0, t4\n\t" - THEAD_VLB_V_V0T0 + THEAD_VLB_V_V8T0 "add t0, t0, t4\n\t" - THEAD_VLB_V_V0T0 + THEAD_VLB_V_V16T0 "add t0, t0, t4\n\t" - THEAD_VLB_V_V0T0 + THEAD_VLB_V_V24T0 : : "r" (datap) : "memory", "t0", "t4"); } else { asm volatile ( From 2b9518684f8558cd61a7c608cc03a27822cf7b03 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 6 Jun 2025 17:24:44 +0800 Subject: [PATCH 025/297] RISC-V: vDSO: Correct inline assembly constraints in the getrandom syscall wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As recently pointed out by Thomas, if a register is forced for two different register variables, among them one is used as "+" (both input and output) and another is only used as input, Clang would treat the conflicting input parameters as undefined behaviour and optimize away the argument assignment. Instead use "=r" (only output) for the output parameter and "r" (only input) for the input parameter. While the example from the GCC documentation uses "0" for the input parameter, this is not necessary as confirmed by the GCC developers and "r" matches what the other architectures' vDSO implementations are using. [ alex: Update log to match v2 (Thomas) ] Link: https://lore.kernel.org/all/20250603-loongarch-vdso-syscall-v1-1-6d12d6dfbdd0@linutronix.de/ Link: https://gcc.gnu.org/onlinedocs/gcc-15.1.0/gcc/Local-Register-Variables.html Link: https://gcc.gnu.org/pipermail/gcc-help/2025-June/144266.html Cc: Thomas Weißschuh Cc: Nathan Chancellor Signed-off-by: Xi Ruoyao Reviewed-by: Thomas Weißschuh Fixes: ee0d03053e70 ("RISC-V: vDSO: Wire up getrandom() vDSO") Link: https://lore.kernel.org/r/20250606092443.73650-2-xry111@xry111.site Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vdso/getrandom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h index 8dc92441702a..c6d66895c1f5 100644 --- a/arch/riscv/include/asm/vdso/getrandom.h +++ b/arch/riscv/include/asm/vdso/getrandom.h @@ -18,7 +18,7 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns register unsigned int flags asm("a2") = _flags; asm volatile ("ecall\n" - : "+r" (ret) + : "=r" (ret) : "r" (nr), "r" (buffer), "r" (len), "r" (flags) : "memory"); From bc75552b80e6683b2def5a0459433607ea4788f5 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Tue, 10 Jun 2025 18:12:32 +0800 Subject: [PATCH 026/297] raid6: riscv: Fix NULL pointer dereference caused by a missing clobber When running the raid6 user-space test program on RISC-V QEMU, there's a segmentation fault which seems caused by accessing a NULL pointer, which is the pointer variable p/q in raid6_rvv*_gen/xor_syndrome_real(), p/q should have been equal to dptr[x], but when I use GDB command to see its value, which was 0x10 like below: " Program received signal SIGSEGV, Segmentation fault. 0x0000000000011062 in raid6_rvv2_xor_syndrome_real (disks=, start=0, stop=, bytes=4096, ptrs=) at rvv.c:386 (gdb) p p $1 = (u8 *) 0x10 " The issue was found to be related with: 1) Compile optimization There's no segmentation fault if compiling the raid6test program with the optimization flag -O0. 2) The RISC-V vector command vsetvli If not used t0 as the first parameter in vsetvli, there's no segmentation fault either. This patch selects the 2nd solution to fix the issue. [Palmer: The actual issue here is a missing clobber in the vsetvli code. It's a little tricky: we've already probed for VLENB so we don't need to look at the output register, we just need to have an X register in the instruction as that's the form required to actually set VL. Thus we clobber a register, and without describing that we end up breaking compilers.] Fixes: 6093faaf9593 ("raid6: Add RISC-V SIMD syndrome and recovery calculations") Signed-off-by: Chunyan Zhang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250610101234.1100660-3-zhangchunyan@iscas.ac.cn Signed-off-by: Palmer Dabbelt --- lib/raid6/rvv.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c index f0887344b274..7d82efa5b14f 100644 --- a/lib/raid6/rvv.c +++ b/lib/raid6/rvv.c @@ -26,9 +26,9 @@ static int rvv_has_vector(void) static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) { u8 **dptr = (u8 **)ptrs; - unsigned long d; - int z, z0; u8 *p, *q; + unsigned long vl, d; + int z, z0; z0 = disks - 3; /* Highest data disk */ p = dptr[z0 + 1]; /* XOR parity */ @@ -36,8 +36,9 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ @@ -99,7 +100,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, { u8 **dptr = (u8 **)ptrs; u8 *p, *q; - unsigned long d; + unsigned long vl, d; int z, z0; z0 = stop; /* P/Q right side optimization */ @@ -108,8 +109,9 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ @@ -195,9 +197,9 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) { u8 **dptr = (u8 **)ptrs; - unsigned long d; - int z, z0; u8 *p, *q; + unsigned long vl, d; + int z, z0; z0 = disks - 3; /* Highest data disk */ p = dptr[z0 + 1]; /* XOR parity */ @@ -205,8 +207,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* @@ -287,7 +290,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, { u8 **dptr = (u8 **)ptrs; u8 *p, *q; - unsigned long d; + unsigned long vl, d; int z, z0; z0 = stop; /* P/Q right side optimization */ @@ -296,8 +299,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* @@ -413,9 +417,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) { u8 **dptr = (u8 **)ptrs; - unsigned long d; - int z, z0; u8 *p, *q; + unsigned long vl, d; + int z, z0; z0 = disks - 3; /* Highest data disk */ p = dptr[z0 + 1]; /* XOR parity */ @@ -423,8 +427,9 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* @@ -539,7 +544,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, { u8 **dptr = (u8 **)ptrs; u8 *p, *q; - unsigned long d; + unsigned long vl, d; int z, z0; z0 = stop; /* P/Q right side optimization */ @@ -548,8 +553,9 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* @@ -721,9 +727,9 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) { u8 **dptr = (u8 **)ptrs; - unsigned long d; - int z, z0; u8 *p, *q; + unsigned long vl, d; + int z, z0; z0 = disks - 3; /* Highest data disk */ p = dptr[z0 + 1]; /* XOR parity */ @@ -731,8 +737,9 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* @@ -915,7 +922,7 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, { u8 **dptr = (u8 **)ptrs; u8 *p, *q; - unsigned long d; + unsigned long vl, d; int z, z0; z0 = stop; /* P/Q right side optimization */ @@ -924,8 +931,9 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" - "vsetvli t0, x0, e8, m1, ta, ma\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" ".option pop\n" + : "=&r" (vl) ); /* From 2aa5801ada29948ce510fc8b1e3b3ec8162423e2 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 10 Jun 2025 14:30:58 -0700 Subject: [PATCH 027/297] RISC-V: uaccess: Wrap the get_user_8 uaccess macro I must have lost this rebasing things during the merge window, I know I got it at some point but it's not here now. Without this I get warnings along the lines of include/linux/fs.h:3975:15: warning: label followed by a declaration is a C23 extension [-Wc23-extensions] 3975 | if (unlikely(get_user(c, path))) | ^ arch/riscv/include/asm/uaccess.h:274:3: note: expanded from macro 'get_user' 274 | __get_user((x), __p) : \ | ^ arch/riscv/include/asm/uaccess.h:244:2: note: expanded from macro '__get_user' 244 | __get_user_error(__gu_val, __gu_ptr, __gu_err); \ | ^ arch/riscv/include/asm/uaccess.h:207:2: note: expanded from macro '__get_user_error' 207 | __ge LD [M] net/802/psnap.ko t_user_nocheck(x, ptr, __gu_failed); \ | ^ arch/riscv/include/asm/uaccess.h:196:3: note: expanded from macro '__get_user_nocheck' 196 | __get_user_8((x), __gu_ptr, label); \ | ^ arch/riscv/include/asm/uaccess.h:130:2: note: expanded from macro '__get_user_8' 130 | u32 __user *__ptr = (u32 __user *)(ptr); \ | ^ Link: https://lore.kernel.org/r/20250610213058.24852-1-palmer@dabbelt.com Reviewed-by: Alexandre Ghiti Cc: stable@vger.kernel.org Fixes: f6bff7827a48 ("riscv: uaccess: use 'asm_goto_output' for get_user()") Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index d472da4450e6..525e50db24f7 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -127,6 +127,7 @@ do { \ #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_user_8(x, ptr, label) \ +do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ u32 __lo, __hi; \ asm_goto_output( \ @@ -141,7 +142,7 @@ do { \ : : label); \ (x) = (__typeof__(x))((__typeof__((x) - (x)))( \ (((u64)__hi << 32) | __lo))); \ - +} while (0) #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ #define __get_user_8(x, ptr, label) \ do { \ From a403fe6c0b17f472e01246eb350f5eef105243ac Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 13 Jun 2025 09:16:48 +0800 Subject: [PATCH 028/297] cxl/edac: Fix potential memory leak issues In cxl_store_rec_gen_media() and cxl_store_rec_dram(), use kmemdup() to duplicate a cxl gen_media/dram event to store the event in a xarray by xa_store(). The cxl gen_media/dram event allocated by kmemdup() should be freed in the case that the xa_store() fails. Fixes: 0b5ccb0de1e2 ("cxl/edac: Support for finding memory operation attributes from the current boot") Signed-off-by: Li Ming Tested-by: Shiju Jose Reviewed-by: Shiju Jose Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250613011648.102840-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/edac.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c index 0ef245d0bd9f..d725ee954199 100644 --- a/drivers/cxl/core/edac.c +++ b/drivers/cxl/core/edac.c @@ -1103,8 +1103,10 @@ int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt) old_rec = xa_store(&array_rec->rec_gen_media, le64_to_cpu(rec->media_hdr.phys_addr), rec, GFP_KERNEL); - if (xa_is_err(old_rec)) + if (xa_is_err(old_rec)) { + kfree(rec); return xa_err(old_rec); + } kfree(old_rec); @@ -1131,8 +1133,10 @@ int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt) old_rec = xa_store(&array_rec->rec_dram, le64_to_cpu(rec->media_hdr.phys_addr), rec, GFP_KERNEL); - if (xa_is_err(old_rec)) + if (xa_is_err(old_rec)) { + kfree(rec); return xa_err(old_rec); + } kfree(old_rec); From 3c70ec71abdaf4e4fa48cd8fdfbbd864d78235a8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 12 Jun 2025 12:20:43 -0700 Subject: [PATCH 029/297] cxl/ras: Fix CPER handler device confusion By inspection, cxl_cper_handle_prot_err() is making a series of fragile assumptions that can lead to crashes: 1/ It assumes that endpoints identified in the record are a CXL-type-3 device, nothing guarantees that. 2/ It assumes that the device is bound to the cxl_pci driver, nothing guarantees that. 3/ Minor, it holds the device lock over the switch-port tracing for no reason as the trace is 100% generated from data in the record. Correct those by checking that the PCIe endpoint parents a cxl_memdev before assuming the format of the driver data, and move the lock to where it is required. Consequently this also makes the implementation ready for CXL accelerators that are not bound to cxl_pci. Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors") Cc: Terry Bowman Cc: Li Ming Cc: Alison Schofield Cc: Ira Weiny Cc: Tony Luck Reviewed-by: Smita Koralahalli Reviewed-by: Dave Jiang Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Link: https://patch.msgid.link/20250612192043.2254617-1-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/ras.c | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 485a831695c7..2731ba3a0799 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, ras_cap.header_log); } -static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, - struct cxl_ras_capability_regs ras_cap) +static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd, + struct cxl_ras_capability_regs ras_cap) { u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; - struct cxl_dev_state *cxlds; - cxlds = pci_get_drvdata(pdev); - if (!cxlds) - return; - - trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + trace_cxl_aer_correctable_error(cxlmd, status); } -static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, - struct cxl_ras_capability_regs ras_cap) +static void +cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd, + struct cxl_ras_capability_regs ras_cap) { u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; - struct cxl_dev_state *cxlds; u32 fe; - cxlds = pci_get_drvdata(pdev); - if (!cxlds) - return; - if (hweight32(status) > 1) fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, ras_cap.cap_control)); else fe = status; - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, + trace_cxl_aer_uncorrectable_error(cxlmd, status, fe, ras_cap.header_log); } +static int match_memdev_by_parent(struct device *dev, const void *uport) +{ + if (is_cxl_memdev(dev) && dev->parent == uport) + return 1; + return 0; +} + static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) { unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device, @@ -73,13 +71,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, data->prot_err.agent_addr.bus, devfn); + struct cxl_memdev *cxlmd; int port_type; if (!pdev) return; - guard(device)(&pdev->dev); - port_type = pci_pcie_type(pdev); if (port_type == PCI_EXP_TYPE_ROOT_PORT || port_type == PCI_EXP_TYPE_DOWNSTREAM || @@ -92,10 +89,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) return; } + guard(device)(&pdev->dev); + if (!pdev->dev.driver) + return; + + struct device *mem_dev __free(put_device) = bus_find_device( + &cxl_bus_type, NULL, pdev, match_memdev_by_parent); + if (!mem_dev) + return; + + cxlmd = to_cxl_memdev(mem_dev); if (data->severity == AER_CORRECTABLE) - cxl_cper_trace_corr_prot_err(pdev, data->ras_cap); + cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap); else - cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap); + cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap); } static void cxl_cper_prot_err_work_fn(struct work_struct *work) From 0f4dd2ce352d38c7ecf1b3821c908816eb6376a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Jun 2025 18:26:48 -0400 Subject: [PATCH 030/297] bcachefs: trace_extent_trim_atomic Add a tracepoint for when we insert only part of an extent, due to too many overwrites. Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 13 ++++++++++++- fs/bcachefs/trace.h | 5 +++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index b899ee75f5b9..e76e58a568bf 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -139,6 +139,17 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, if (ret) return ret; - bch2_cut_back(end, k); + /* tracepoint */ + + if (bpos_lt(end, k->k.p)) { + if (trace_extent_trim_atomic_enabled()) { + CLASS(printbuf, buf)(); + bch2_bpos_to_text(&buf, end); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); + trace_extent_trim_atomic(trans->c, buf.buf); + } + bch2_cut_back(end, k); + } return 0; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index dc09532796af..41efebdd06ef 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1490,6 +1490,11 @@ DEFINE_EVENT(fs_str, io_move_evacuate_bucket, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, extent_trim_atomic, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS TRACE_EVENT(update_by_path, From 3bd6f8aeae3d3f8121cbae5a8650a46622aa4e07 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Jun 2025 18:27:37 -0400 Subject: [PATCH 031/297] bcachefs: btree iter tracepoints We've been seeing some livelock-ish behavior in the index update part of the main write path, and while we've got low level btree path tracepoints, we've been lacking high level btree iterator tracepoints. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 67 +++++++++++++++++++++++++++++++++++++--- fs/bcachefs/trace.h | 20 ++++++++++++ 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b78403376c07..b586ecf2fdfa 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2326,6 +2326,20 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct } bch2_btree_iter_verify(trans, iter); + + if (trace___btree_iter_peek_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace___btree_iter_peek(trans->c, buf.buf); + } + return k; } @@ -2484,6 +2498,19 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree bch2_btree_iter_verify_entry_exit(iter); + if (trace_btree_iter_peek_max_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_max(trans->c, buf.buf); + } + return k; end: bch2_btree_iter_set_pos(trans, iter, end); @@ -2724,6 +2751,19 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(trans, iter); + + if (trace_btree_iter_peek_prev_min_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_prev_min(trans->c, buf.buf); + } return k; end: bch2_btree_iter_set_pos(trans, iter, end); @@ -2767,8 +2807,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; + if (iter->pos.inode == KEY_INODE_MAX) { + k = bkey_s_c_null; + goto out2; + } bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); } @@ -2785,8 +2827,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre } struct btree_path *path = btree_iter_path(trans, iter); - if (unlikely(!btree_path_node(path, path->level))) - return bkey_s_c_null; + if (unlikely(!btree_path_node(path, path->level))) { + k = bkey_s_c_null; + goto out2; + } btree_path_set_should_be_locked(trans, path); @@ -2879,7 +2923,20 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre bch2_btree_iter_verify(trans, iter); ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) - return bkey_s_c_err(ret); + k = bkey_s_c_err(ret); +out2: + if (trace_btree_iter_peek_slot_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_slot(trans->c, buf.buf); + } return k; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 41efebdd06ef..e759c9ff3965 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1495,6 +1495,26 @@ DEFINE_EVENT(fs_str, extent_trim_atomic, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, btree_iter_peek_slot, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, __btree_iter_peek, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, btree_iter_peek_max, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, btree_iter_peek_prev_min, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS TRACE_EVENT(update_by_path, From 9b9a3270092bf8030dbe21ce90b2d0c8d98d33c7 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 13 Jun 2025 21:19:50 +0800 Subject: [PATCH 032/297] bcachefs: Don't allocate new memory when mempool is exhausted Allocating new memory when mempool is exhausted is too complicated, just return ENOMEM is fine. memcpy is not needed, since there might be pointers point to the old memory, that's the bug. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b586ecf2fdfa..55de2e474705 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3217,28 +3217,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long } if (trans->used_mempool) { - if (trans->mem_bytes >= new_bytes) - goto out_change_top; - - /* No more space from mempool item, need malloc new one */ - new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); - - new_mem = kmalloc(new_bytes, GFP_KERNEL); - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); - - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_mem); - return ERR_PTR(ret); - } - } - memcpy(new_mem, trans->mem, trans->mem_top); - trans->used_mempool = false; - mempool_free(trans->mem, &c->btree_trans_mem_pool); - goto out_new_mem; + EBUG_ON(trans->mem_bytes >= new_bytes); + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); } new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); @@ -3249,7 +3229,6 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; - memcpy(new_mem, trans->mem, trans->mem_top); trans->used_mempool = true; kfree(trans->mem); } @@ -3264,7 +3243,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long if (ret) return ERR_PTR(ret); } -out_new_mem: + trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -3273,7 +3252,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long return ERR_PTR(btree_trans_restart_ip(trans, BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } -out_change_top: + bch2_trans_kmalloc_trace(trans, size, ip); p = trans->mem + trans->mem_top; From 9b54efe66c9b44e7446e8a81a058c014cd43661d Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 13 Jun 2025 22:54:59 +0800 Subject: [PATCH 033/297] bcachefs: Fix alloc_req use after free Now the alloc_req is allocated from the bump allocator, if there is reallocation, the memory of alloc_req would be frees, fix by delaying the reallocation to transaction restart, it has to restart anyway. Reported-by: syzbot+2887a13a5c387e616a68@syzkaller.appspotmail.com Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 48 +++++++++++++++++++++++++++------------ fs/bcachefs/btree_types.h | 1 + 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 55de2e474705..e2747c57ba2a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3216,25 +3216,32 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long mutex_unlock(&s->lock); } - if (trans->used_mempool) { + if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) { EBUG_ON(trans->mem_bytes >= new_bytes); return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); } - new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (old_bytes) { + trans->realloc_bytes_required = new_bytes; + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); + } + + EBUG_ON(trans->mem); + + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); - new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + new_mem = kmalloc(new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; trans->used_mempool = true; - kfree(trans->mem); } - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + EBUG_ON(!new_mem); trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -3247,14 +3254,6 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long trans->mem = new_mem; trans->mem_bytes = new_bytes; - if (old_bytes) { - trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart_ip(trans, - BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); - } - - bch2_trans_kmalloc_trace(trans, size, ip); - p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3315,6 +3314,27 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; + if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) { + EBUG_ON(!trans->mem || !trans->mem_bytes); + unsigned new_bytes = trans->realloc_bytes_required; + void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); + + if (!new_mem) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + trans->used_mempool = true; + kfree(trans->mem); + } + } + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + } + trans_for_each_path(trans, path, i) { path->should_be_locked = false; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3aa4a602bd02..112170fd9c8f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -497,6 +497,7 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; + unsigned realloc_bytes_required; #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_trans_kmalloc_trace trans_kmalloc_trace; #endif From e31144f8cbe041a572ca64d9261ceeb647ccf557 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 13 Jun 2025 03:01:58 +0800 Subject: [PATCH 034/297] bcachefs: Add missing EBUG_ON Just like the EBUG_ON in bch2_journal_add_entry(). Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index d9710801e3ee..55952143f0d3 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -757,6 +757,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, btree_trans_journal_entries_start(trans), trans->journal_entries.u64s); + EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s); + trans->journal_res.offset += trans->journal_entries.u64s; trans->journal_res.u64s -= trans->journal_entries.u64s; From 0dc8eaebed99ea323ab3be7cf1438277f990189d Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 13 Jun 2025 03:01:59 +0800 Subject: [PATCH 035/297] bcachefs: Delay calculation of trans->journal_u64s When there is commit error that need split btree leaf, fsck might change the value of trans->journal_entries.u64s, when retry commit, the value of trans->journal_u64s would be incorrect, which will lead to trans->journal_res.u64s underflow, and then out of bounds write will occur: [ 464.496970][T11969] Call trace: [ 464.496973][T11969] show_stack+0x3c/0x88 (C) [ 464.496995][T11969] dump_stack_lvl+0xf8/0x178 [ 464.497014][T11969] dump_stack+0x20/0x30 [ 464.497031][T11969] __bch2_trans_log_str+0x344/0x350 [ 464.497048][T11969] bch2_trans_log_str+0x3c/0x60 [ 464.497065][T11969] __bch2_fsck_err+0x11bc/0x1390 [ 464.497083][T11969] bch2_check_discard_freespace_key+0xad4/0x10d0 [ 464.497100][T11969] bch2_bucket_alloc_freelist+0x99c/0x1130 [ 464.497117][T11969] bch2_bucket_alloc_trans+0x79c/0xcb8 [ 464.497133][T11969] bch2_bucket_alloc_set_trans+0x378/0xc20 [ 464.497151][T11969] __open_bucket_add_buckets+0x7fc/0x1c00 [ 464.497168][T11969] open_bucket_add_buckets+0x184/0x3a8 [ 464.497185][T11969] bch2_alloc_sectors_start_trans+0xa04/0x1da0 [ 464.497203][T11969] bch2_btree_reserve_get+0x6e0/0xef0 [ 464.497220][T11969] bch2_btree_update_start+0x1618/0x2600 [ 464.497239][T11969] bch2_btree_split_leaf+0xcc/0x730 [ 464.497258][T11969] bch2_trans_commit_error+0x22c/0xc30 [ 464.497276][T11969] __bch2_trans_commit+0x207c/0x4e30 [ 464.497292][T11969] bch2_journal_replay+0x9e0/0x1420 [ 464.497305][T11969] __bch2_run_recovery_passes+0x458/0xf98 [ 464.497318][T11969] bch2_run_recovery_passes+0x280/0x478 [ 464.497331][T11969] bch2_fs_recovery+0x24f0/0x3a28 [ 464.497344][T11969] bch2_fs_start+0xb80/0x1248 [ 464.497358][T11969] bch2_fs_get_tree+0xe94/0x1708 [ 464.497377][T11969] vfs_get_tree+0x84/0x2d0 Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 55952143f0d3..61107f2310ab 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -1005,6 +1005,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct btree_insert_entry *errored_at = NULL; struct bch_fs *c = trans->c; + unsigned journal_u64s = 0; int ret = 0; bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -1033,10 +1034,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); + journal_u64s = jset_u64s(trans->accounting.u64s); trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; @@ -1056,11 +1057,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) continue; /* we're going to journal the key being updated: */ - trans->journal_u64s += jset_u64s(i->k->k.u64s); + journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(i->old_k.u64s); + journal_u64s += jset_u64s(i->old_k.u64s); } if (trans->extra_disk_res) { @@ -1078,6 +1079,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + trans->journal_u64s = journal_u64s + trans->journal_entries.u64s; + ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); /* make sure we didn't drop or screw up locks: */ From 0e62fca2a6dbfcedaab4919d7ad2044f20fdf889 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 14:53:42 -0400 Subject: [PATCH 036/297] bcachefs: Fix bch2_journal_keys_peek_prev_min() this code is rarely invoked, so - we had a few bugs left from basing it off of bch2_journal_keys_peek_max()... Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index cf7398751644..de996c848e43 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -141,8 +141,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - while (*idx && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + while (*idx < keys->nr && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) >= 0) { (*idx)++; iters++; if (iters == 10) { From 425da82c63e3ac06b2f8780879e83463e88cbf9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 14:57:34 -0400 Subject: [PATCH 037/297] bcachefs: btree_iter: fix updates, journal overlay We need to start searching from search_key - _not_ path->pos, which will point to the key we found in the btree Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e2747c57ba2a..061603999ae5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2076,14 +2076,14 @@ inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter static noinline void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k) + struct bpos search_key, struct bkey_s_c *k) { struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; trans_for_each_update(trans, i) if (!i->key_cache_already_flushed && i->btree_id == iter->btree_id && - bpos_le(i->k->k.p, iter->pos) && + bpos_le(i->k->k.p, search_key) && bpos_ge(i->k->k.p, k->k ? k->k->p : end)) { iter->k = i->k->k; *k = bkey_i_to_s_c(i->k); @@ -2092,6 +2092,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_ static noinline void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); @@ -2100,7 +2101,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter trans_for_each_update(trans, i) if (!i->key_cache_already_flushed && i->btree_id == iter->btree_id && - bpos_ge(i->k->k.p, path->pos) && + bpos_ge(i->k->k.p, search_key) && bpos_le(i->k->k.p, k->k ? k->k->p : end)) { iter->k = i->k->k; *k = bkey_i_to_s_c(i->k); @@ -2122,13 +2123,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_pos, struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); return bch2_journal_keys_peek_max(trans->c, iter->btree_id, path->level, - path->pos, + search_pos, end_pos, &iter->journal_idx); } @@ -2138,7 +2140,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); if (k) { iter->k = k->k; @@ -2151,11 +2153,12 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, static noinline void btree_trans_peek_journal(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, + bch2_btree_journal_peek(trans, iter, search_key, k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; @@ -2165,13 +2168,14 @@ void btree_trans_peek_journal(struct btree_trans *trans, static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, path->level, - path->pos, + search_key, end_pos, &iter->journal_idx); } @@ -2179,11 +2183,12 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, static noinline void btree_trans_peek_prev_journal(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = - bch2_btree_journal_peek_prev(trans, iter, + bch2_btree_journal_peek_prev(trans, iter, search_key, k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { @@ -2292,11 +2297,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_journal(trans, iter, &k); + btree_trans_peek_journal(trans, iter, search_key, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) - bch2_btree_trans_peek_updates(trans, iter, &k); + bch2_btree_trans_peek_updates(trans, iter, search_key, &k); if (k.k && bkey_deleted(k.k)) { /* @@ -2584,11 +2589,11 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_prev_journal(trans, iter, &k); + btree_trans_peek_prev_journal(trans, iter, search_key, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) - bch2_btree_trans_peek_prev_updates(trans, iter, &k); + bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k); if (likely(k.k && !bkey_deleted(k.k))) { break; From f2ed0892732d470abde7a1af360bd670dc8a68c6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 15:17:37 -0400 Subject: [PATCH 038/297] bcachefs: better __bch2_snapshot_is_ancestor() assert Previously, we weren't checking the result of the skiplist walk, just the is_ancestor bitmap. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 23a332d76b32..38aeaa128d27 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -135,7 +135,9 @@ static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { - bool ret; +#ifdef CONFIG_BCACHEFS_DEBUG + u32 orig_id = id; +#endif guard(rcu)(); struct snapshot_table *t = rcu_dereference(c->snapshots); @@ -147,11 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - ret = id && id < ancestor + bool ret = id && id < ancestor ? test_ancestor_bitmap(t, id, ancestor) : id == ancestor; - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); return ret; } From 2ba562cc04dc22d01750e80d0638cc1d91fdd95c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 15:15:09 -0400 Subject: [PATCH 039/297] bcachefs: pass last_seq into fs_journal_start() Prep work for journal rewind, where the seq we're replaying from may be different than the last journal entry's last_seq. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 18 ++++++------------ fs/bcachefs/journal.h | 2 +- fs/bcachefs/recovery.c | 4 ++-- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dda802a656cf..df71af0013ba 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1474,14 +1474,13 @@ void bch2_fs_journal_stop(struct journal *j) clear_bit(JOURNAL_running, &j->flags); } -int bch2_fs_journal_start(struct journal *j, u64 cur_seq) +int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - u64 last_seq = cur_seq, nr, seq; /* * @@ -1495,17 +1494,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) return -EINVAL; } - genradix_for_each_reverse(&c->journal_entries, iter, _i) { - i = *_i; + /* Clean filesystem? */ + if (!last_seq) + last_seq = cur_seq; - if (journal_replay_ignore(i)) - continue; - - last_seq = le64_to_cpu(i->j.last_seq); - break; - } - - nr = cur_seq - last_seq; + u64 nr = cur_seq - last_seq; /* * Extra fudge factor, in case we crashed when the journal pin fifo was @@ -1532,6 +1525,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); + u64 seq; fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 83734fe4331f..977907038d98 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -453,7 +453,7 @@ int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64); +int bch2_fs_journal_start(struct journal *, u64, u64); void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0b21fa6ff062..6aef8b101820 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -965,7 +965,7 @@ int bch2_fs_recovery(struct bch_fs *c) ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", journal_seq, last_seq, blacklist_seq - 1) ?: - bch2_fs_journal_start(&c->journal, journal_seq); + bch2_fs_journal_start(&c->journal, last_seq, journal_seq); if (ret) goto err; @@ -1181,7 +1181,7 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - ret = bch2_fs_journal_start(&c->journal, 1); + ret = bch2_fs_journal_start(&c->journal, 1, 1); if (ret) goto err; From c1ccd43b357e157d78c899e61764fc83b4d1dbaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 17:47:07 -0400 Subject: [PATCH 040/297] bcachefs: Fix "now allowing incompatible features" message Check against version_incompat_allowed, not version_incompat. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6aef8b101820..820249e9c5ea 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -692,7 +692,7 @@ static bool check_version_upgrade(struct bch_fs *c) ret = true; } - if (new_version > c->sb.version_incompat && + if (new_version > c->sb.version_incompat_allowed && c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { struct printbuf buf = PRINTBUF; From c27e5782d957c7242e9cb6405a395b89e8e1d573 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 18:27:33 -0400 Subject: [PATCH 041/297] bcachefs: Fix snapshot_key_missing_inode_snapshot repair When the inode was a whiteout, we were inserting a new whiteout at the wrong (old) snapshot. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 68ed69a255e1..b80a56e19b40 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -903,17 +903,15 @@ lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, str w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct bch_inode_unpacked new = i->inode; - struct bkey_i whiteout; - - new.bi_snapshot = k.k->p.snapshot; - if (!i->whiteout) { + struct bch_inode_unpacked new = i->inode; + new.bi_snapshot = k.k->p.snapshot; ret = __bch2_fsck_write_inode(trans, &new); } else { + struct bkey_i whiteout; bkey_init(&whiteout.k); whiteout.k.type = KEY_TYPE_whiteout; - whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot); + whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, &whiteout, BTREE_UPDATE_internal_snapshot_node); From b17d7bdb128c50025fc3eb7a9e57b3c7caa4a5ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 18:49:54 -0400 Subject: [PATCH 042/297] bcachefs: fsck: fix add_inode() the inode btree uses the offset field for the inum, not the inode field. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b80a56e19b40..28ca07c0e029 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -806,7 +806,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, if (!n->whiteout) { return bch2_inode_unpack(inode, &n->inode); } else { - n->inode.bi_inum = inode.k->p.inode; + n->inode.bi_inum = inode.k->p.offset; n->inode.bi_snapshot = inode.k->p.snapshot; return 0; } From 191334400d8031df367eb74b6ab49bc163f6f821 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 18:58:57 -0400 Subject: [PATCH 043/297] bcachefs: fsck: fix extent past end of inode repair Fix the case where we're deleting in a different snapshot and need to emit a whiteout - that requires a regular BTREE_ITER_filter_snapshots iterator. Also, only delete the part of the extent that extents past i_size. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 28ca07c0e029..48810a8e5d4b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1820,18 +1820,39 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; - if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; + + if (fsck_err_on(k.k->p.offset > last_block && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n%s", i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct btree_iter iter2; + struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout)); + ret = PTR_ERR_OR_ZERO(whiteout); + if (ret) + goto err; - bch2_trans_copy_iter(trans, &iter2, iter); - bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); + bkey_init(&whiteout->k); + whiteout->k.p = SPOS(k.k->p.inode, + last_block, + i->inode.bi_snapshot); + bch2_key_resize(&whiteout->k, + min(KEY_SIZE_MAX & (~0 << c->block_bits), + U64_MAX - whiteout->k.p.offset)); + + + /* + * Need a normal (not BTREE_ITER_all_snapshots) + * iterator, if we're deleting in a different + * snapshot and need to emit a whiteout + */ + struct btree_iter iter2; + bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents, + bkey_start_pos(&whiteout->k), + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_btree_delete_at(trans, &iter2, + bch2_trans_update(trans, &iter2, whiteout, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); if (ret) From 7360ee47599af91a1d5f4e74d635d9408a54e489 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 11 Jun 2025 22:20:10 +0300 Subject: [PATCH 044/297] s390/pkey: Prevent overflow in size calculation for memdup_user() Number of apqn target list entries contained in 'nr_apqns' variable is determined by userspace via an ioctl call so the result of the product in calculation of size passed to memdup_user() may overflow. In this case the actual size of the allocated area and the value describing it won't be in sync leading to various types of unpredictable behaviour later. Use a proper memdup_array_user() helper which returns an error if an overflow is detected. Note that it is different from when nr_apqns is initially zero - that case is considered valid and should be handled in subsequent pkey_handler implementations. Found by Linux Verification Center (linuxtesting.org). Fixes: f2bbc96e7cfa ("s390/pkey: add CCA AES cipher key support") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: Holger Dengler Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/r/20250611192011.206057-1-pchelkin@ispras.ru Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/pkey_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index cef60770f68b..b3fcdcae379e 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -86,7 +86,7 @@ static void *_copy_apqns_from_user(void __user *uapqns, size_t nr_apqns) if (!uapqns || nr_apqns == 0) return NULL; - return memdup_user(uapqns, nr_apqns * sizeof(struct pkey_apqn)); + return memdup_array_user(uapqns, nr_apqns, sizeof(struct pkey_apqn)); } static int pkey_ioctl_genseck(struct pkey_genseck __user *ugs) From 17c3395e25f7db23fa5758c257a372d410d16cfd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 19:16:12 -0400 Subject: [PATCH 045/297] bcachefs: opts.journal_rewind Add a mount option for rewinding the journal, bringing the entire filesystem to where it was at a previous point in time. This is for extreme disaster recovery scenarios - it's not intended as an undelete operation. The option takes a journal sequence number; the desired sequence number can be determined with 'bcachefs list_journal' Caveats: - The 'journal_transaction_names' option must have been enabled (it's on by default). The option controls emitting of extra debug info in the journal, so we can see what individual transactions were doing; It also enables journalling of keys being overwritten, which is what we rely on here. - A full fsck run will be automatically triggered since alloc info will be inconsistent. Only leaf node updates to non-alloc btrees are rewound, since rewinding interior btree updates isn't possible or desirable. - We can't do anything about data that was deleted and overwritten. Lots of metadata updates after the point in time we're rewinding to shouldn't cause a problem, since we segragate data and metadata allocations (this is in order to make repair by btree node scan practical on larger filesystems; there's a small 64-bit per device bitmap in the superblock of device ranges with btree nodes, and we try to keep this small). However, having discards enabled will cause problems, since buckets are discarded as soon as they become empty (this is why we don't implement fstrim: we don't need it). Hopefully, this feature will be a one-off thing that's never used again: this was implemented for recovering from the "vfs i_nlink 0 -> subvol deletion" bug, and that bug was unusually disastrous and additional safeguards have since been implemented. But if it does turn out that we need this more in the future, I'll have to implement an option so that empty buckets aren't discarded immediately - lagging by perhaps 1% of device capacity. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 56 +++++++++++++++++--------- fs/bcachefs/btree_journal_iter_types.h | 5 ++- fs/bcachefs/journal_io.c | 21 ++++++++-- fs/bcachefs/opts.h | 5 +++ fs/bcachefs/recovery.c | 5 +++ 5 files changed, 67 insertions(+), 25 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index de996c848e43..a41fabd06332 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -641,10 +641,11 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) { const struct journal_key *l = _l; const struct journal_key *r = _r; + int rewind = l->rewind && r->rewind ? -1 : 1; return journal_key_cmp(l, r) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); + ((cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset)) * rewind); } void bch2_journal_keys_put(struct bch_fs *c) @@ -713,6 +714,8 @@ int bch2_journal_keys_sort(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; size_t nr_read = 0; + u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; + genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -721,28 +724,43 @@ int bch2_journal_keys_sort(struct bch_fs *c) cond_resched(); - for_each_jset_key(k, entry, &i->j) { - struct journal_key n = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; + vstruct_for_each(&i->j, entry) { + bool rewind = !entry->level && + !btree_id_is_alloc(entry->btree_id) && + le64_to_cpu(i->j.seq) >= rewind_seq; - if (darray_push(keys, n)) { - __journal_keys_sort(keys); + if (entry->type != (rewind + ? BCH_JSET_ENTRY_overwrite + : BCH_JSET_ENTRY_btree_keys)) + continue; - if (keys->nr * 8 > keys->size * 7) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", - keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return bch_err_throw(c, ENOMEM_journal_keys_sort); + if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) + continue; + + jset_entry_for_each_key(entry, k) { + struct journal_key n = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .rewind = rewind, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return bch_err_throw(c, ENOMEM_journal_keys_sort); + } + + BUG_ON(darray_push(keys, n)); } - BUG_ON(darray_push(keys, n)); + nr_read++; } - - nr_read++; } } diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h index 8b773823704f..86aacb254fb2 100644 --- a/fs/bcachefs/btree_journal_iter_types.h +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -11,8 +11,9 @@ struct journal_key { u32 journal_offset; enum btree_id btree_id:8; unsigned level:8; - bool allocated; - bool overwritten; + bool allocated:1; + bool overwritten:1; + bool rewind:1; struct journal_key_range_overwritten __rcu * overwritten_range; struct bkey_i *k; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0b15d71a8d2d..afbf12e8f0c5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -160,6 +160,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; + if (last_seq && c->opts.journal_rewind) + last_seq = min(last_seq, c->opts.journal_rewind); + if (!c->journal.oldest_seq_found_ondisk || le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); @@ -1430,11 +1433,21 @@ int bch2_journal_read(struct bch_fs *c, printbuf_reset(&buf); prt_printf(&buf, "journal read done, replaying entries %llu-%llu", *last_seq, *blacklist_seq - 1); + + /* + * Drop blacklisted entries and entries older than last_seq (or start of + * journal rewind: + */ + u64 drop_before = *last_seq; + if (c->opts.journal_rewind) { + drop_before = min(drop_before, c->opts.journal_rewind); + prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); + } + + *last_seq = drop_before; if (*start_seq != *blacklist_seq) prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); bch_info(c, "%s", buf.buf); - - /* Drop blacklisted entries and entries older than last_seq: */ genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; @@ -1442,7 +1455,7 @@ int bch2_journal_read(struct bch_fs *c, continue; seq = le64_to_cpu(i->j.seq); - if (seq < *last_seq) { + if (seq < drop_before) { journal_replay_free(c, i, false); continue; } @@ -1455,7 +1468,7 @@ int bch2_journal_read(struct bch_fs *c, } } - ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1); + ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); if (ret) goto err; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 2a02606254b3..b0a76bd6d6f5 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -379,6 +379,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Exit recovery immediately prior to journal replay")\ + x(journal_rewind, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(0, U64_MAX), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Rewind journal") \ x(recovery_passes, u64, \ OPT_FS|OPT_MOUNT, \ OPT_BITFIELD(bch2_recovery_passes), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 820249e9c5ea..37f2cc1ec2f8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -757,6 +757,11 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.nochanges) c->opts.read_only = true; + if (c->opts.journal_rewind) { + bch_info(c, "rewinding journal, fsck required"); + c->opts.fsck = true; + } + mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; From 10dfe4926de30b550913409d107005278ab47911 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 20:06:33 -0400 Subject: [PATCH 046/297] bcachefs: Kill unused tracepoints Dead code cleanup. Link: https://lore.kernel.org/linux-bcachefs/20250612224059.39fddd07@batman.local.home/ Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 5 +- fs/bcachefs/btree_write_buffer.c | 3 + fs/bcachefs/errcode.h | 5 -- fs/bcachefs/trace.h | 100 +------------------------------ 4 files changed, 9 insertions(+), 104 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 61107f2310ab..639ef75b3dbd 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -595,12 +595,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, int ret = 0; bch2_trans_verify_not_unlocked_or_in_restart(trans); - +#if 0 + /* todo: bring back dynamic fault injection */ if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } - +#endif /* * Check if the insert will fit in the leaf node with the write lock * held, otherwise another thread could write the node changing the diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 90b21e61d2b6..21b5c03d1822 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -676,6 +676,9 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, goto err; bch2_bkey_buf_copy(last_flushed, c, tmp.k); + + /* can we avoid the unconditional restart? */ + trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); ret = bch_err_throw(c, transaction_restart_write_buffer_flush); } err: diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ac3264134a15..86a842f1e88e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -137,7 +137,6 @@ x(BCH_ERR_transaction_restart, transaction_restart_relock) \ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ @@ -148,11 +147,8 @@ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ - x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ @@ -241,7 +237,6 @@ x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ x(BCH_ERR_journal_res_blocked, journal_stuck) \ x(BCH_ERR_journal_res_blocked, journal_retry_open) \ - x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index e759c9ff3965..9c5a9c551f03 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1080,34 +1080,14 @@ TRACE_EVENT(trans_blocked_journal_reclaim, __entry->must_wait) ); -TRACE_EVENT(trans_restart_journal_preres_get, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - unsigned flags), - TP_ARGS(trans, caller_ip, flags), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(unsigned, flags ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->flags = flags; - ), - - TP_printk("%s %pS %x", __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->flags) -); - +#if 0 +/* todo: bring back dynamic fault injection */ DEFINE_EVENT(transaction_event, trans_restart_fault_inject, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) ); +#endif DEFINE_EVENT(transaction_event, trans_traverse_all, TP_PROTO(struct btree_trans *trans, @@ -1195,19 +1175,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1229,13 +1196,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1294,44 +1254,6 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); -TRACE_EVENT(trans_restart_key_cache_key_realloced, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_u64s, - unsigned new_u64s), - TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(enum btree_id, btree_id ) - TRACE_BPOS_entries(pos) - __field(u32, old_u64s ) - __field(u32, new_u64s ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - __entry->old_u64s = old_u64s; - __entry->new_u64s = new_u64s; - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->old_u64s, - __entry->new_u64s) -); - DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), @@ -1927,21 +1849,6 @@ TRACE_EVENT(btree_path_free, __entry->dup_locked) ); -TRACE_EVENT(btree_path_free_trans_begin, - TP_PROTO(btree_path_idx_t path), - TP_ARGS(path), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - ), - - TP_fast_assign( - __entry->idx = path; - ), - - TP_printk(" path %3u", __entry->idx) -); - #else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ #ifndef _TRACE_BCACHEFS_H @@ -1959,7 +1866,6 @@ static inline void trace_btree_path_traverse_start(struct btree_trans *trans, st static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} -static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} #endif #endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ From 7c9cef5f8bf10a803fd0937ea071a93778f1108a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jun 2025 18:01:25 -0400 Subject: [PATCH 047/297] bcachefs: mark more errors autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d06e73884871..56971ebe0f78 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -217,7 +217,7 @@ enum bch_fsck_flags { x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ x(inode_snapshot_mismatch, 196, 0) \ - x(snapshot_key_missing_inode_snapshot, 314, 0) \ + x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \ x(inode_unlinked_and_not_open, 281, 0) \ @@ -253,18 +253,18 @@ enum bch_fsck_flags { x(extent_overlapping, 215, 0) \ x(key_in_missing_inode, 216, 0) \ x(key_in_wrong_inode_type, 217, 0) \ - x(extent_past_end_of_inode, 218, 0) \ + x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ x(dirent_empty_name, 219, 0) \ x(dirent_val_too_big, 220, 0) \ x(dirent_name_too_long, 221, 0) \ x(dirent_name_embedded_nul, 222, 0) \ x(dirent_name_dot_or_dotdot, 223, 0) \ x(dirent_name_has_slash, 224, 0) \ - x(dirent_d_type_wrong, 225, 0) \ + x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \ x(inode_bi_parent_wrong, 226, 0) \ x(dirent_in_missing_dir_inode, 227, 0) \ x(dirent_in_non_dir_inode, 228, 0) \ - x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \ x(dirent_to_overwritten_inode, 302, 0) \ x(dirent_to_missing_subvol, 230, 0) \ x(dirent_to_itself, 231, 0) \ From d89a34b14df5c205de698c23c3950b2b947cdb97 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sat, 14 Jun 2025 17:18:07 +0800 Subject: [PATCH 048/297] bcachefs: Move bset size check before csum check In syzbot's crash, the bset's u64s is larger than the btree node. Reported-by: syzbot+bfaeaa8e26281970158d@syzkaller.appspotmail.com Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d8f3c4c65e90..005d5c94edd0 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -723,12 +723,11 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, - unsigned offset, unsigned sectors, int write, + unsigned offset, int write, struct bch_io_failures *failed, struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; @@ -778,15 +777,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (!write && - btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - offset, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, NULL, @@ -1151,6 +1141,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); if (first) { + sectors = vstruct_sectors(b->data, c->block_bits); + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, NULL, + bset_past_end_of_btree_node, + "bset past end of btree node (offset %u len %u but written %zu)", + b->written, sectors, ptr_written ?: btree_sectors(c))) + i->u64s = 0; if (good_csum_type) { struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); bool csum_bad = bch2_crc_cmp(b->data->csum, csum); @@ -1178,9 +1176,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); - - sectors = vstruct_sectors(b->data, c->block_bits); } else { + sectors = vstruct_sectors(bne, c->block_bits); + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, NULL, + bset_past_end_of_btree_node, + "bset past end of btree node (offset %u len %u but written %zu)", + b->written, sectors, ptr_written ?: btree_sectors(c))) + i->u64s = 0; if (good_csum_type) { struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); bool csum_bad = bch2_crc_cmp(bne->csum, csum); @@ -1201,14 +1205,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "decrypting btree node: %s", bch2_err_str(ret))) goto fsck_err; } - - sectors = vstruct_sectors(bne, c->block_bits); } b->version_ondisk = min(b->version_ondisk, le16_to_cpu(i->version)); - ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg); + ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg); if (ret) goto fsck_err; @@ -2267,7 +2269,7 @@ static void btree_node_write_endio(struct bio *bio) } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned sectors) + struct bset *i) { int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), (struct bkey_validate_context) { @@ -2282,7 +2284,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, } ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL); + validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -2475,7 +2477,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) /* if we're going to be encrypting, check metadata validity first: */ if (validate_before_checksum && - validate_bset_for_write(c, b, i, sectors_to_write)) + validate_bset_for_write(c, b, i)) goto err; ret = bset_encrypt(c, i, b->written << 9); @@ -2492,7 +2494,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) /* if we're not encrypting, check metadata after checksumming: */ if (!validate_before_checksum && - validate_bset_for_write(c, b, i, sectors_to_write)) + validate_bset_for_write(c, b, i)) goto err; /* From 56be92c63f02e0f6fd855075acb1471ea1c68539 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sun, 15 Jun 2025 13:41:22 +0800 Subject: [PATCH 049/297] bcachefs: Fix pool->alloc NULL pointer dereference btree_interior_update_pool has not been initialized before the filesystem becomes read-write, thus mempool_alloc in bch2_btree_update_start will trigger pool->alloc NULL pointer dereference in mempool_alloc_noprof Reported-by: syzbot+2f3859bd28f20fa682e6@syzkaller.appspotmail.com Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/chardev.c | 29 ++++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5a1cede2febf..8043943cdf6a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -767,7 +767,8 @@ struct btree_trans_buf { x(sysfs) \ x(btree_write_buffer) \ x(btree_node_scrub) \ - x(async_recovery_passes) + x(async_recovery_passes) \ + x(ioctl_data) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index fde3c2380e28..5ea89aa2b0c4 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -319,6 +319,7 @@ static int bch2_data_thread(void *arg) ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; } + enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); return 0; } @@ -378,15 +379,24 @@ static long bch2_ioctl_data(struct bch_fs *c, struct bch_data_ctx *ctx; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) + return -EROFS; - if (arg.op >= BCH_DATA_OP_NR || arg.flags) - return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto put_ref; + } + + if (arg.op >= BCH_DATA_OP_NR || arg.flags) { + ret = -EINVAL; + goto put_ref; + } ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + if (!ctx) { + ret = -ENOMEM; + goto put_ref; + } ctx->c = c; ctx->arg = arg; @@ -395,7 +405,12 @@ static long bch2_ioctl_data(struct bch_fs *c, &bcachefs_data_ops, bch2_data_thread); if (ret < 0) - kfree(ctx); + goto cleanup; + return ret; +cleanup: + kfree(ctx); +put_ref: + enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); return ret; } From 03208bd06a61bc2ebc423b6485cbcffecd37af36 Mon Sep 17 00:00:00 2001 From: Bharadwaj Raju Date: Sun, 15 Jun 2025 22:15:38 +0530 Subject: [PATCH 050/297] bcachefs: don't return fsck_fix for unfixable node errors in __btree_err After cd3cdb1ef706 ("Single err message for btree node reads"), all errors caused __btree_err to return -BCH_ERR_fsck_fix no matter what the actual error type was if the recovery pass was scanning for btree nodes. This lead to the code continuing despite things like bad node formats when they earlier would have caused a jump to fsck_err, because btree_err only jumps when the return from __btree_err does not match fsck_fix. Ultimately this lead to undefined behavior by attempting to unpack a key based on an invalid format. Make only errors of type -BCH_ERR_btree_node_read_err_fixable cause __btree_err to return -BCH_ERR_fsck_fix when scanning for btree nodes. Reported-by: syzbot+cfd994b9cdf00446fd54@syzkaller.appspotmail.com Fixes: cd3cdb1ef706 ("bcachefs: Single err message for btree node reads") Signed-off-by: Bharadwaj Raju Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 005d5c94edd0..dca5530a2f49 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -557,7 +557,9 @@ static int __btree_err(int ret, const char *fmt, ...) { if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - return bch_err_throw(c, fsck_fix); + return ret == -BCH_ERR_btree_node_read_err_fixable + ? bch_err_throw(c, fsck_fix) + : ret; bool have_retry = false; int ret2; From f2a701fd94f161bdca7537284ba218c20181451e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Jun 2025 12:09:24 -0400 Subject: [PATCH 051/297] bcachefs: fsck: Improve check_key_has_inode() Print out more info when we find a key (extent, dirent, xattr) for a missing inode - was there a good inode in an older snapshot, full(ish) list of keys for that missing inode, so we can make better decisions on how to repair. If it looks like it should've been deleted, autofix it. If we ever hit the non-autofix cases, we'll want to write more repair code (possibly reconstituting the inode). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 108 ++++++++++++++++++++++++++------- fs/bcachefs/sb-errors_format.h | 2 +- 2 files changed, 88 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 48810a8e5d4b..f25d33fb3e23 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1436,6 +1436,7 @@ static int check_key_has_inode(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + struct btree_iter iter2 = {}; int ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -1445,40 +1446,105 @@ static int check_key_has_inode(struct btree_trans *trans, bool have_inode = i && !i->whiteout; - if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; + if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) + goto reconstruct; - inode->last_pos.inode--; - ret = bch_err_throw(c, transaction_restart_nested); - goto err; + if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) + goto out; + + prt_printf(&buf, ", "); + + bool have_old_inode = false; + darray_for_each(inode->inodes, i2) + if (!i2->whiteout && + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && + btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { + prt_printf(&buf, "but found good inode in older snapshot\n"); + bch2_inode_unpacked_to_text(&buf, &i2->inode); + prt_newline(&buf); + have_old_inode = true; + break; + } + + struct bkey_s_c k2; + unsigned nr_keys = 0; + + prt_printf(&buf, "found keys:\n"); + + for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, + SPOS(k.k->p.inode, 0, k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, k2, ret) { + nr_keys++; + if (nr_keys <= 10) { + bch2_bkey_val_to_text(&buf, c, k2); + prt_newline(&buf); + } + if (nr_keys >= 100) + break; } - if (fsck_err_on(!have_inode, - trans, key_in_missing_inode, - "key in missing inode:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + if (ret) + goto err; - if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - trans, key_in_wrong_inode_type, - "key for wrong inode mode %o:\n%s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + if (nr_keys > 100) + prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); + else if (nr_keys > 10) + prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); + + if (!have_inode) { + if (fsck_err_on(!have_inode, + trans, key_in_missing_inode, + "key in missing inode%s", buf.buf)) { + /* + * Maybe a deletion that raced with data move, or something + * weird like that? But if we know the inode was deleted, or + * it's just a few keys, we can safely delete them. + * + * If it's many keys, we should probably recreate the inode + */ + if (have_old_inode || nr_keys <= 2) + goto delete; + else + goto reconstruct; + } + } else { + /* + * not autofix, this one would be a giant wtf - bit error in the + * inode corrupting i_mode? + * + * may want to try repairing inode instead of deleting + */ + if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + trans, key_in_wrong_inode_type, + "key for wrong inode mode %o%s", + i->inode.bi_mode, buf.buf)) + goto delete; + } out: err: fsck_err: + bch2_trans_iter_exit(trans, &iter2); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; delete: + /* + * XXX: print out more info + * count up extents for this inode, check if we have different inode in + * an older snapshot version, perhaps decide if we want to reconstitute + */ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); goto out; +reconstruct: + ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = bch_err_throw(c, transaction_restart_nested); + goto out; } static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 56971ebe0f78..f1aa40542a0e 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -251,7 +251,7 @@ enum bch_fsck_flags { x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ - x(key_in_missing_inode, 216, 0) \ + x(key_in_missing_inode, 216, FSCK_AUTOFIX) \ x(key_in_wrong_inode_type, 217, 0) \ x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ x(dirent_empty_name, 219, 0) \ From 1cddad0fcbccc7056036f2a83184a3ca5c62f7d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Jun 2025 16:43:34 -0400 Subject: [PATCH 052/297] bcachefs: Call bch2_fs_init_rw() early if we'll be going rw kthread creation checks for pending signals, which is _very_ annoying if we have to do a long recovery and don't go rw until we've done significant work. Check if we'll be going rw and pre-allocate kthreads/workqueues. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 10 ++++++++++ fs/bcachefs/recovery_passes.c | 6 +----- fs/bcachefs/recovery_passes.h | 9 +++++++++ fs/bcachefs/super.c | 13 +++++++++++-- fs/bcachefs/super.h | 1 + 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 37f2cc1ec2f8..fa5d1ef5bea6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -762,6 +762,16 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.fsck = true; } + if (go_rw_in_recovery(c)) { + /* + * start workqueues/kworkers early - kthread creation checks for + * pending signals, which is _very_ annoying + */ + ret = bch2_fs_init_rw(c); + if (ret) + goto err; + } + mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 35ac0d64d73a..c2c18c0a5429 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -217,11 +217,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || - !c->opts.read_only || - !c->sb.clean || - c->opts.recovery_passes || - (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) { + if (go_rw_in_recovery(c)) { if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); bch2_reconstruct_alloc(c); diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 260571c7105e..2117f0ce1922 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -17,6 +17,15 @@ enum bch_run_recovery_pass_flags { RUN_RECOVERY_PASS_ratelimit = BIT(1), }; +static inline bool go_rw_in_recovery(struct bch_fs *c) +{ + return (c->journal_keys.nr || + !c->opts.read_only || + !c->sb.clean || + c->opts.recovery_passes || + (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))); +} + int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a5b97c9c5163..69c097ff54e7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -210,7 +210,6 @@ static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); static void bch2_dev_io_ref_stop(struct bch_dev *, int); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); -static int bch2_fs_init_rw(struct bch_fs *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { @@ -794,7 +793,7 @@ static int bch2_fs_online(struct bch_fs *c) return ret; } -static int bch2_fs_init_rw(struct bch_fs *c) +int bch2_fs_init_rw(struct bch_fs *c) { if (test_bit(BCH_FS_rw_init_done, &c->flags)) return 0; @@ -1015,6 +1014,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, if (ret) goto err; + if (go_rw_in_recovery(c)) { + /* + * start workqueues/kworkers early - kthread creation checks for + * pending signals, which is _very_ annoying + */ + ret = bch2_fs_init_rw(c); + if (ret) + goto err; + } + #ifdef CONFIG_UNICODE /* Default encoding until we can potentially have more as an option. */ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index dc52f06cb2b9..e90bab9afe78 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -46,6 +46,7 @@ void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); +int bch2_fs_init_rw(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); From 9ba6930ef8e0e00102716f4627896e0be6358d7b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 14:00:40 -0400 Subject: [PATCH 053/297] bcachefs: Fix __bch2_inum_to_path() when crossing subvol boundaries The bch2_subvolume_get_snapshot() call needs to happen before the dirent lookup - the dirent is in the parent subvolume. Also, check for loops. Signed-off-by: Kent Overstreet --- fs/bcachefs/namei.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 779c22eb3979..7d1068aa998f 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -625,14 +625,26 @@ static int __bch2_inum_to_path(struct btree_trans *trans, { unsigned orig_pos = path->pos; int ret = 0; + DARRAY(subvol_inum) inums = {}; + + if (!snapshot) { + ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); + if (ret) + goto disconnected; + } while (true) { - if (!snapshot) { - ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); - if (ret) - goto disconnected; + subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; + + if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { + prt_str_reversed(path, "(loop)"); + break; } + ret = darray_push(&inums, n); + if (ret) + goto err; + struct bch_inode_unpacked inode; ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); if (ret) @@ -650,7 +662,9 @@ static int __bch2_inum_to_path(struct btree_trans *trans, inum = inode.bi_dir; if (inode.bi_parent_subvol) { subvol = inode.bi_parent_subvol; - snapshot = 0; + ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); + if (ret) + goto disconnected; } struct btree_iter d_iter; @@ -662,6 +676,7 @@ static int __bch2_inum_to_path(struct btree_trans *trans, goto disconnected; struct qstr dirent_name = bch2_dirent_get_name(d); + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); prt_char(path, '/'); @@ -677,8 +692,10 @@ static int __bch2_inum_to_path(struct btree_trans *trans, goto err; reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); + darray_exit(&inums); return 0; err: + darray_exit(&inums); return ret; disconnected: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) From 7029cc4d13453499a88f512720d26c1a0c4e957b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 19:33:59 -0400 Subject: [PATCH 054/297] bcachefs: fsck: Print path when we find a subvol loop Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f25d33fb3e23..0d87aa9b4f23 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2578,6 +2578,11 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type != KEY_TYPE_subvolume) return 0; + subvol_inum start = { + .subvol = k.k->p.offset, + .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), + }; + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { ret = darray_push(&subvol_path, k.k->p.offset); if (ret) @@ -2596,11 +2601,11 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (darray_u32_has(&subvol_path, parent)) { printbuf_reset(&buf); - prt_printf(&buf, "subvolume loop:\n"); + prt_printf(&buf, "subvolume loop: "); - darray_for_each_reverse(subvol_path, i) - prt_printf(&buf, "%u ", *i); - prt_printf(&buf, "%u", parent); + ret = bch2_inum_to_path(trans, start, &buf); + if (ret) + goto err; if (fsck_err(trans, subvol_loop, "%s", buf.buf)) ret = reattach_subvol(trans, s); From c1ca07a4dd1a1ab17ed651729c37b04af9f75ee8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 14:15:28 -0400 Subject: [PATCH 055/297] bcachefs: fsck: Fix remove_backpointer() for subvol roots The dirent will be in a different snapshot if the inode is a subvolume root. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0d87aa9b4f23..a3f73e7a2113 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -493,10 +493,18 @@ static int remove_backpointer(struct btree_trans *trans, if (!inode->bi_dir) return 0; + u32 snapshot = inode->bi_snapshot; + + if (inode->bi_parent_subvol) { + int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); + if (ret) + return ret; + } + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: bch2_fsck_remove_dirent(trans, d.k->p); From 9fb09ace59b2beab312ec225630ce87ddbec6d79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 14:41:27 -0400 Subject: [PATCH 056/297] bcachefs: fsck: Fix reattach_inode() for subvol roots bch_subvolume.fs_path_parent needs to be updated as well, it should match inode.bi_parent_subvol. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index a3f73e7a2113..6e64af4ccc36 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -372,6 +372,18 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (inode->bi_subvol) { inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + struct btree_iter subvol_iter; + struct bkey_i_subvolume *subvol = + bch2_bkey_get_mut_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, inode->bi_subvol), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; + bch2_trans_iter_exit(trans, &subvol_iter); + u64 root_inum; ret = subvol_lookup(trans, inode->bi_parent_subvol, &dirent_snapshot, &root_inum); From 3e5ceaa5bfd71ae94d736426f8a27391023182ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 16:13:02 -0400 Subject: [PATCH 057/297] bcachefs: fsck: check_directory_structure runs in reverse order When we find a directory connectivity problem, we should do the repair in the oldest snapshot that has the issue - so that we don't end up duplicating work or making a real mess of things. Oldest snapshot IDs have the highest integer value, so - just walk inodes in reverse order. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6e64af4ccc36..7414a16af990 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2841,7 +2841,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) int bch2_check_directory_structure(struct bch_fs *c) { int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_intent| BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, From 8d6ac82361df55d223186d8cd2ce884310ee5e6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 16:14:59 -0400 Subject: [PATCH 058/297] bcachefs: fsck: additional diagnostics for reattach_inode() Log the inode's new path. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7414a16af990..f2c8e904acab 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -399,6 +399,8 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; + bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); + lostfound.bi_nlink += S_ISDIR(inode->bi_mode); /* ensure lost+found inode is also present in inode snapshot */ @@ -435,6 +437,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; + { + CLASS(printbuf, buf)(); + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, + inode->bi_snapshot, NULL, &buf); + if (ret) + return ret; + + bch_info(c, "reattached at %s", buf.buf); + } + /* * Fix up inodes in child snapshots: if they should also be reattached * update the backpointer field, if they should not be we need to emit From 583ba52a40dd1f9328e2626056f0797d273817e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 16:15:41 -0400 Subject: [PATCH 059/297] bcachefs: fsck: check_subdir_count logs path We can easily go from inode number -> path now, which makes for more useful log messages. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f2c8e904acab..96b9ea58705f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2066,14 +2066,22 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ continue; } - if (fsck_err_on(i->inode.bi_nlink != i->count, - trans, inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { - i->inode.bi_nlink = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; + if (i->inode.bi_nlink != i->count) { + CLASS(printbuf, buf)(); + + lockrestart_do(trans, + bch2_inum_snapshot_to_path(trans, w->last_pos.inode, + i->inode.bi_snapshot, NULL, &buf)); + + if (fsck_err_on(i->inode.bi_nlink != i->count, + trans, inode_dir_wrong_nlink, + "directory with wrong i_nlink: got %u, should be %llu\n%s", + i->inode.bi_nlink, i->count, buf.buf)) { + i->inode.bi_nlink = i->count; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) + break; + } } } fsck_err: From 495ba899d5a163d4cd50627ab48e3edd80dfa3a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 16:16:38 -0400 Subject: [PATCH 060/297] bcachefs: fsck: Fix check_path_loop() + snapshots A path exists in a particular snapshot: we should do the pathwalk in the snapshot ID of the inode we started from, _not_ change snapshot ID as we walk inodes and dirents. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 62 +++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 96b9ea58705f..73cff24598ed 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2689,19 +2689,13 @@ int bch2_check_subvolume_structure(struct bch_fs *c) return ret; } -struct pathbuf_entry { - u64 inum; - u32 snapshot; -}; - -typedef DARRAY(struct pathbuf_entry) pathbuf; - -static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, +static int bch2_bi_depth_renumber_one(struct btree_trans *trans, + u64 inum, u32 snapshot, u32 new_depth) { struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, p->inum, p->snapshot), 0); + SPOS(0, inum, snapshot), 0); struct bch_inode_unpacked inode; int ret = bkey_err(k) ?: @@ -2720,14 +2714,15 @@ static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_ return ret; } -static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) +static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, + u32 snapshot, u32 new_bi_depth) { u32 restart_count = trans->restart_count; int ret = 0; darray_for_each_reverse(*path, i) { ret = nested_lockrestart_do(trans, - bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); + bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); bch_err_fn(trans->c, ret); if (ret) break; @@ -2738,26 +2733,19 @@ static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 return ret ?: trans_was_restarted(trans, restart_count); } -static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) -{ - darray_for_each(*p, i) - if (i->inum == inum && - i->snapshot == snapshot) - return true; - return false; -} - static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; struct btree_iter inode_iter = {}; - pathbuf path = {}; + darray_u64 path = {}; struct printbuf buf = PRINTBUF; u32 snapshot = inode_k.k->p.snapshot; bool redo_bi_depth = false; u32 min_bi_depth = U32_MAX; int ret = 0; + struct bpos start = inode_k.k->p; + struct bch_inode_unpacked inode; ret = bch2_inode_unpack(inode_k, &inode); if (ret) @@ -2766,9 +2754,9 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; - u32 parent_snapshot = snapshot; - d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); + d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) goto out; @@ -2786,15 +2774,10 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) bch2_trans_iter_exit(trans, &dirent_iter); - ret = darray_push(&path, ((struct pathbuf_entry) { - .inum = inode.bi_inum, - .snapshot = snapshot, - })); + ret = darray_push(&path, inode.bi_inum); if (ret) return ret; - snapshot = parent_snapshot; - bch2_trans_iter_exit(trans, &inode_iter); inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inode.bi_dir, snapshot), 0); @@ -2816,15 +2799,22 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) break; inode = parent_inode; - snapshot = inode_k.k->p.snapshot; redo_bi_depth = true; - if (path_is_dup(&path, inode.bi_inum, snapshot)) { + if (darray_find(path, inode.bi_inum)) { printbuf_reset(&buf); - prt_printf(&buf, "directory structure loop:\n"); - darray_for_each_reverse(path, i) - prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot); - prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot); + prt_printf(&buf, "directory structure loop in snapshot %u: ", + snapshot); + + ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); + if (ret) + goto out; + + if (c->opts.verbose) { + prt_newline(&buf); + darray_for_each(path, i) + prt_printf(&buf, "%llu ", *i); + } if (fsck_err(trans, dir_loop, "%s", buf.buf)) { ret = remove_backpointer(trans, &inode); @@ -2844,7 +2834,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) min_bi_depth = 0; if (redo_bi_depth) - ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); + ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); From 6c4897caefc8e4c6f2bf08f8317b955672667909 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 20:08:23 -0400 Subject: [PATCH 061/297] bcachefs: Fix bch2_read_bio_to_text() We can only pass negative error codes to bch2_err_str(); if it's a positive integer it's not an error and we trip an assert. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 04bbdcf58e40..cd184b219a65 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1491,7 +1491,12 @@ void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); prt_printf(out, "context:\t%u\n", rbio->context); - prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret)); + + int ret = READ_ONCE(rbio->ret); + if (ret < 0) + prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); + else + prt_printf(out, "ret:\t%i\n", ret); prt_printf(out, "flags:\t"); bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); From a766cfbbeb3a74397965a8fa2e9a402026d3e1d8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 12 Jun 2025 22:28:56 -0700 Subject: [PATCH 062/297] bpf: Mark dentry->d_inode as trusted_or_null LSM hooks such as security_path_mknod() and security_inode_rename() have access to newly allocated negative dentry, which has NULL d_inode. Therefore, it is necessary to do the NULL pointer check for d_inode. Also add selftests that checks the verifier enforces the NULL pointer check. Signed-off-by: Song Liu Reviewed-by: Matt Bobrowski Link: https://lore.kernel.org/r/20250613052857.1992233-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- .../selftests/bpf/progs/verifier_vfs_accept.c | 18 ++++++++++++++++++ .../selftests/bpf/progs/verifier_vfs_reject.c | 15 +++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..169845710c7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7027,8 +7027,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -7066,7 +7065,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -7076,6 +7074,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c index a7c0a553aa50..3e2d76ee8050 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -2,6 +2,7 @@ /* Copyright (c) 2024 Google LLC. */ #include +#include #include #include @@ -82,4 +83,21 @@ int BPF_PROG(path_d_path_from_file_argument, struct file *file) return 0; } +SEC("lsm.s/inode_rename") +__success +int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *inode = new_dentry->d_inode; + ino_t ino; + + if (!inode) + return 0; + ino = inode->i_ino; + if (ino == 0) + return -EACCES; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c index d6d3f4fcb24c..4b392c6c8fc4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -2,6 +2,7 @@ /* Copyright (c) 2024 Google LLC. */ #include +#include #include #include #include @@ -158,4 +159,18 @@ int BPF_PROG(path_d_path_kfunc_non_lsm, struct path *path, struct file *f) return 0; } +SEC("lsm.s/inode_rename") +__failure __msg("invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *inode = new_dentry->d_inode; + ino_t ino; + + ino = inode->i_ino; + if (ino == 0) + return -EACCES; + return 0; +} char _license[] SEC("license") = "GPL"; From e1f0e1a45a40f45ed615abf938cbdfeb7793a9ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jun 2025 10:23:53 -0400 Subject: [PATCH 063/297] bcachefs: Fix restart handling in btree_node_scrub_work() btree node scrub was sometimes failing to rewrite nodes with errors; bch2_btree_node_rewrite() can return a transaction restart and we weren't checking - the lockrestart_do() needs to wrap the entire operation. And there's a better helper it should've been using, bch2_btree_node_rewrite_key(), which makes all this more convenient. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 28 ++++++---------------------- fs/bcachefs/btree_update_interior.c | 11 +++++------ fs/bcachefs/btree_update_interior.h | 3 +++ 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index dca5530a2f49..08b22bddd747 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1986,28 +1986,12 @@ static void btree_node_scrub_work(struct work_struct *work) prt_newline(&err); if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { - struct btree_trans *trans = bch2_trans_get(c); - - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, scrub->btree, - scrub->key.k->k.p, 0, scrub->level - 1, 0); - - struct btree *b; - int ret = lockrestart_do(trans, - PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); - if (ret) - goto err; - - if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { - bch_err(c, "error validating btree node during scrub on %s at btree %s", - scrub->ca->name, err.buf); - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_begin(trans); - bch2_trans_put(trans); + int ret = bch2_trans_do(c, + bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1, + scrub->key.k, 0)); + if (!bch2_err_matches(ret, ENOENT) && + !bch2_err_matches(ret, EROFS)) + bch_err_fn_ratelimited(c, ret); } printbuf_exit(&err); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e77584607f0d..7bf1bd6a6e92 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2293,9 +2293,9 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, goto out; } -static int bch2_btree_node_rewrite_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_i *k, unsigned flags) +int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, unsigned flags) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, @@ -2367,9 +2367,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work) int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, a->btree_id, a->level, a->key.k, 0)); - if (ret != -ENOENT && - !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_shutdown) + if (!bch2_err_matches(ret, ENOENT) && + !bch2_err_matches(ret, EROFS)) bch_err_fn_ratelimited(c, ret); spin_lock(&c->btree_node_rewrites_lock); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index b649c36c3fbb..ac04e45a8515 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -176,6 +176,9 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned, unsigned); +int bch2_btree_node_rewrite_key(struct btree_trans *, + enum btree_id, unsigned, + struct bkey_i *, unsigned); int bch2_btree_node_rewrite_pos(struct btree_trans *, enum btree_id, unsigned, struct bpos, unsigned, unsigned); From 7f8073cfb04a97842fe891ca50dad60afd1e3121 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 13 Jun 2025 17:53:04 +0200 Subject: [PATCH 064/297] s390/ptrace: Fix pointer dereferencing in regs_get_kernel_stack_nth() The recent change which added READ_ONCE_NOCHECK() to read the nth entry from the kernel stack incorrectly dropped dereferencing of the stack pointer in order to read the requested entry. In result the address of the entry is returned instead of its content. Dereference the pointer again to fix this. Reported-by: Will Deacon Closes: https://lore.kernel.org/r/20250612163331.GA13384@willie-the-truck Fixes: d93a855c31b7 ("s390/ptrace: Avoid KASAN false positives in regs_get_kernel_stack_nth()") Cc: stable@vger.kernel.org Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 62c0ab4a4b9d..0905fa99a31e 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -265,7 +265,7 @@ static __always_inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *r addr = kernel_stack_pointer(regs) + n * sizeof(long); if (!regs_within_kernel_stack(regs, addr)) return 0; - return READ_ONCE_NOCHECK(addr); + return READ_ONCE_NOCHECK(*(unsigned long *)addr); } /** From bbc3a0b17a890aa19bddd0f9b08e8af488f1ec94 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jun 2025 13:08:35 -0400 Subject: [PATCH 065/297] bcachefs: fsck: Fix check_directory_structure when no check_dirents check_directory_structure runs after check_dirents, so it expects that it won't see any inodes with missing backpointers - normally. But online fsck can't run check_dirents yet, or the user might only be running a specific pass, so we need to be careful that this isn't an error. If an inode is unreachable, that's handled by a separate pass. Also, add a new 'bch2_inode_has_backpointer()' helper, since we were doing this inconsistently. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 18 +++++++++++++----- fs/bcachefs/inode.h | 5 +++++ fs/bcachefs/namei.c | 3 +-- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 73cff24598ed..57ddc20a5cce 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -327,7 +327,8 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) (inode->bi_flags & BCH_INODE_has_child_snapshot)) return false; - return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); + return !bch2_inode_has_backpointer(inode) && + !(inode->bi_flags & BCH_INODE_unlinked); } static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) @@ -514,7 +515,7 @@ static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - if (!inode->bi_dir) + if (!bch2_inode_has_backpointer(inode)) return 0; u32 snapshot = inode->bi_snapshot; @@ -1165,13 +1166,14 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; - if (u.bi_dir || u.bi_dir_offset) { + if (bch2_inode_has_backpointer(&u)) { ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) goto err; } - if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), + if (fsck_err_on(bch2_inode_has_backpointer(&u) && + (u.bi_flags & BCH_INODE_unlinked), trans, inode_unlinked_but_has_dirent, "inode unlinked but has dirent\n%s", (printbuf_reset(&buf), @@ -2751,7 +2753,13 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) if (ret) return ret; - while (!inode.bi_subvol) { + /* + * If we're running full fsck, check_dirents() will have already ran, + * and we shouldn't see any missing backpointers here - otherwise that's + * handled separately, by check_unreachable_inodes + */ + while (!inode.bi_subvol && + bch2_inode_has_backpointer(&inode)) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 82cec2836cbd..b8ec3e628d90 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -254,6 +254,11 @@ static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_ : c->opts.casefold; } +static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) +{ + return bi->bi_dir || bi->bi_dir_offset; +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 7d1068aa998f..c3f87c59922d 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -734,8 +734,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, if (inode_points_to_dirent(target, d)) return 0; - if (!target->bi_dir && - !target->bi_dir_offset) { + if (!bch2_inode_has_backpointer(target)) { fsck_err_on(S_ISDIR(target->bi_mode), trans, inode_dir_missing_backpointer, "directory with missing backpointer\n%s", From 3f890768dab1f97ff9bd7ebb76f4c52309401501 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jun 2025 16:41:43 -0400 Subject: [PATCH 066/297] bcachefs: fsck: fix unhandled restart in topology repair Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e92cf3928c63..697c6ecc3a65 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -503,8 +503,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + /* + * XXX: we're not passing the trans object here because we're not set up + * to handle a transaction restart - this code needs to be rewritten + * when we start doing online topology repair + */ + bch2_trans_unlock_long(trans); if (mustfix_fsck_err_on(!have_child, - trans, btree_node_topology_interior_node_empty, + c, btree_node_topology_interior_node_empty, "empty interior btree node at %s", buf.buf)) ret = DROP_THIS_NODE; err: From 1df310860aa5b4d97b3cc83a1a8dee599071b72d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jun 2025 16:49:40 -0400 Subject: [PATCH 067/297] bcachefs: fsck: Fix oops in key_visible_in_snapshot() The normal fsck code doesn't call key_visible_in_snapshot() with an empty list of snapshot IDs seen (the current snapshot ID will always be on the list), but str_hash_repair_key() -> bch2_get_snapshot_overwrites() can, and that's totally fine as long as we check for it. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 57ddc20a5cce..9920f1affc5b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -728,14 +728,8 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, u32 id, u32 ancestor) { - ssize_t i; - EBUG_ON(id > ancestor); - /* @ancestor should be the snapshot most recently added to @seen */ - EBUG_ON(ancestor != seen->pos.snapshot); - EBUG_ON(ancestor != darray_last(seen->ids)); - if (id == ancestor) return true; @@ -751,11 +745,8 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see * numerically, since snapshot ID lists are kept sorted, so if we find * an id that's an ancestor of @id we're done: */ - - for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i] >= id; - --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) + darray_for_each_reverse(seen->ids, i) + if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) return false; return true; From 88bd771191f7f20d6295700d1746f576419e3d1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jun 2025 20:24:12 -0400 Subject: [PATCH 068/297] bcachefs: fix spurious error in read_btree_roots() Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fa5d1ef5bea6..e0d824471bff 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -607,6 +607,7 @@ static int read_btree_roots(struct bch_fs *c) buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; + ret = 0; } } From 434635987fb7d544dd134f25c922413e19b02112 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jun 2025 20:22:32 -0400 Subject: [PATCH 069/297] bcachefs: Fix missing newlines before ero Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 1 + fs/bcachefs/journal_io.c | 5 +++-- fs/bcachefs/recovery.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 5f1174348974..e848e210a9bf 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -249,6 +249,7 @@ static int data_update_invalid_bkey(struct data_update *m, bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\nnew: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); bch2_fs_emergency_read_only2(c, &buf); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index afbf12e8f0c5..dd3f3434c1b0 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1716,9 +1716,10 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_log_msg_start(c, &buf); if (err == -BCH_ERR_journal_write_err) - prt_printf(&buf, "unable to write journal to sufficient devices"); + prt_printf(&buf, "unable to write journal to sufficient devices\n"); else - prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err)); + prt_printf(&buf, "journal write error marking replicas: %s\n", + bch2_err_str(err)); bch2_fs_emergency_read_only2(c, &buf); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e0d824471bff..d0b7e3a36a54 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1142,7 +1142,7 @@ int bch2_fs_recovery(struct bch_fs *c) struct printbuf buf = PRINTBUF; bch2_log_msg_start(c, &buf); - prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret)); + prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); bch2_fs_emergency_read_only2(c, &buf); bch2_print_str(c, KERN_ERR, buf.buf); From ba8dac350faf16afc129ce6303ca4feaf083ccb1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Jun 2025 11:26:33 +0800 Subject: [PATCH 070/297] f2fs: fix to zero post-eof page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fstest reports a f2fs bug: generic/363 42s ... [failed, exit status 1]- output mismatch (see /share/git/fstests/results//generic/363.out.bad) --- tests/generic/363.out 2025-01-12 21:57:40.271440542 +0800 +++ /share/git/fstests/results//generic/363.out.bad 2025-05-19 19:55:58.000000000 +0800 @@ -1,2 +1,78 @@ QA output created by 363 fsx -q -S 0 -e 1 -N 100000 +READ BAD DATA: offset = 0xd6fb, size = 0xf044, fname = /mnt/f2fs/junk +OFFSET GOOD BAD RANGE +0x1540d 0x0000 0x2a25 0x0 +operation# (mod 256) for the bad data may be 37 +0x1540e 0x0000 0x2527 0x1 ... (Run 'diff -u /share/git/fstests/tests/generic/363.out /share/git/fstests/results//generic/363.out.bad' to see the entire diff) Ran: generic/363 Failures: generic/363 Failed 1 of 1 tests The root cause is user can update post-eof page via mmap [1], however, f2fs missed to zero post-eof page in below operations, so, once it expands i_size, then it will include dummy data locates previous post-eof page, so during below operations, we need to zero post-eof page. Operations which can include dummy data after previous i_size after expanding i_size: - write - mapwrite [1] - truncate - fallocate * preallocate * zero_range * insert_range * collapse_range - clone_range (doesn’t support in f2fs) - copy_range (doesn’t support in f2fs) [1] https://man7.org/linux/man-pages/man2/mmap.2.html 'BUG section' Cc: stable@kernel.org Signed-off-by: Chao Yu Reviewed-by: Zhiguo Niu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6bd3de64f2a8..696131e655ed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,6 +35,17 @@ #include #include +static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +{ + loff_t old_size = i_size_read(inode); + + if (old_size >= new_size) + return; + + /* zero or drop pages only in range of [old_size, new_size] */ + truncate_pagecache(inode, old_size); +} + static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); + filemap_invalidate_unlock(inode->i_mapping); + file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); + folio_lock(folio); if (unlikely(folio->mapping != inode->i_mapping || folio_pos(folio) > i_size_read(inode) || @@ -1109,6 +1125,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_down_write(&fi->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + if (attr->ia_size > old_size) + f2fs_zero_post_eof_page(inode, attr->ia_size); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1227,6 +1245,10 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1510,6 +1532,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); truncate_pagecache(inode, offset); @@ -1631,6 +1655,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + filemap_invalidate_lock(mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1762,6 +1790,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); + + f2fs_zero_post_eof_page(inode, offset + len); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1819,6 +1849,10 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; @@ -4860,6 +4894,10 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) err = file_modified(file); if (err) return err; + + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); + filemap_invalidate_unlock(inode->i_mapping); return count; } From d4adf1c9ee7722545450608bcb095fb31512f0c6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 18 Jun 2025 17:57:40 -0400 Subject: [PATCH 071/297] bpf: Adjust free target to avoid global starvation of LRU map BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it will once steal one elem from a neighbor and after that each time flush this one element to the global list and immediately recycle it. Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%). CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely. Avoid global list exhaustion by limiting aggregate percpu caches to half of map size, by adjusting LOCAL_FREE_TARGET based on cpu count. This change has no effect on sufficiently large tables. Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->free_target. The extra field fits in a hole in struct bpf_lru. The cacheline is already warm where read in the hot path. The field is only accessed with the lru lock held. Tested-by: Anton Protopopov Signed-off-by: Willem de Bruijn Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/map_hash.rst | 8 ++- Documentation/bpf/map_lru_hash_update.dot | 6 +- kernel/bpf/bpf_lru_list.c | 9 ++- kernel/bpf/bpf_lru_list.h | 1 + tools/testing/selftests/bpf/test_lru_map.c | 72 +++++++++++----------- 5 files changed, 52 insertions(+), 44 deletions(-) diff --git a/Documentation/bpf/map_hash.rst b/Documentation/bpf/map_hash.rst index d2343952f2cb..8606bf958a8c 100644 --- a/Documentation/bpf/map_hash.rst +++ b/Documentation/bpf/map_hash.rst @@ -233,10 +233,16 @@ attempts in order to enforce the LRU property which have increasing impacts on other CPUs involved in the following operation attempts: - Attempt to use CPU-local state to batch operations -- Attempt to fetch free nodes from global lists +- Attempt to fetch ``target_free`` free nodes from global lists - Attempt to pull any node from a global list and remove it from the hashmap - Attempt to pull any node from any CPU's list and remove it from the hashmap +The number of nodes to borrow from the global list in a batch, ``target_free``, +depends on the size of the map. Larger batch size reduces lock contention, but +may also exhaust the global structure. The value is computed at map init to +avoid exhaustion, by limiting aggregate reservation by all CPUs to half the map +size. With a minimum of a single element and maximum budget of 128 at a time. + This algorithm is described visually in the following diagram. See the description in commit 3a08c2fd7634 ("bpf: LRU List") for a full explanation of the corresponding operations: diff --git a/Documentation/bpf/map_lru_hash_update.dot b/Documentation/bpf/map_lru_hash_update.dot index a0fee349d29c..ab10058f5b79 100644 --- a/Documentation/bpf/map_lru_hash_update.dot +++ b/Documentation/bpf/map_lru_hash_update.dot @@ -35,18 +35,18 @@ digraph { fn_bpf_lru_list_pop_free_to_local [shape=rectangle,fillcolor=2, label="Flush local pending, Rotate Global list, move - LOCAL_FREE_TARGET + target_free from global -> local"] // Also corresponds to: // fn__local_list_flush() // fn_bpf_lru_list_rotate() fn___bpf_lru_node_move_to_free[shape=diamond,fillcolor=2, - label="Able to free\nLOCAL_FREE_TARGET\nnodes?"] + label="Able to free\ntarget_free\nnodes?"] fn___bpf_lru_list_shrink_inactive [shape=rectangle,fillcolor=3, label="Shrink inactive list up to remaining - LOCAL_FREE_TARGET + target_free (global LRU -> local)"] fn___bpf_lru_list_shrink [shape=diamond,fillcolor=2, label="> 0 entries in\nlocal free list?"] diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index fda7589c5023..4ae83f4b7fc7 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try) return ret; } +/* Inverse of how bpf_common_lru_populate derives target_free from map_size. */ +static unsigned int __map_size(unsigned int tgt_free) +{ + return tgt_free * nr_cpus * 2; +} + /* Size of the LRU map is 2 * Add key=1 (+1 key) * Add key=2 (+1 key) @@ -231,11 +237,11 @@ static void test_lru_sanity0(int map_type, int map_flags) printf("Pass\n"); } -/* Size of the LRU map is 1.5*tgt_free - * Insert 1 to tgt_free (+tgt_free keys) - * Lookup 1 to tgt_free/2 - * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys) - * => 1+tgt_free/2 to LOCALFREE_TARGET will be removed by LRU +/* Verify that unreferenced elements are recycled before referenced ones. + * Insert elements. + * Reference a subset of these. + * Insert more, enough to trigger recycling. + * Verify that unreferenced are recycled. */ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) { @@ -257,7 +263,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) batch_size = tgt_free / 2; assert(batch_size * 2 == tgt_free); - map_size = tgt_free + batch_size; + map_size = __map_size(tgt_free) + batch_size; lru_map_fd = create_map(map_type, map_flags, map_size); assert(lru_map_fd != -1); @@ -266,13 +272,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to tgt_free (+tgt_free keys) */ - end_key = 1 + tgt_free; + /* Insert map_size - batch_size keys */ + end_key = 1 + __map_size(tgt_free); for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Lookup 1 to tgt_free/2 */ + /* Lookup 1 to batch_size */ end_key = 1 + batch_size; for (key = 1; key < end_key; key++) { assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value)); @@ -280,12 +286,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) BPF_NOEXIST)); } - /* Insert 1+tgt_free to 2*tgt_free - * => 1+tgt_free/2 to LOCALFREE_TARGET will be + /* Insert another map_size - batch_size keys + * Map will contain 1 to batch_size plus these latest, i.e., + * => previous 1+batch_size to map_size - batch_size will have been * removed by LRU */ - key = 1 + tgt_free; - end_key = key + tgt_free; + key = 1 + __map_size(tgt_free); + end_key = key + __map_size(tgt_free); for (; key < end_key; key++) { assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -301,17 +308,8 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) printf("Pass\n"); } -/* Size of the LRU map 1.5 * tgt_free - * Insert 1 to tgt_free (+tgt_free keys) - * Update 1 to tgt_free/2 - * => The original 1 to tgt_free/2 will be removed due to - * the LRU shrink process - * Re-insert 1 to tgt_free/2 again and do a lookup immeidately - * Insert 1+tgt_free to tgt_free*3/2 - * Insert 1+tgt_free*3/2 to tgt_free*5/2 - * => Key 1+tgt_free to tgt_free*3/2 - * will be removed from LRU because it has never - * been lookup and ref bit is not set +/* Verify that insertions exceeding map size will recycle the oldest. + * Verify that unreferenced elements are recycled before referenced. */ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) { @@ -334,7 +332,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) batch_size = tgt_free / 2; assert(batch_size * 2 == tgt_free); - map_size = tgt_free + batch_size; + map_size = __map_size(tgt_free) + batch_size; lru_map_fd = create_map(map_type, map_flags, map_size); assert(lru_map_fd != -1); @@ -343,8 +341,8 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to tgt_free (+tgt_free keys) */ - end_key = 1 + tgt_free; + /* Insert map_size - batch_size keys */ + end_key = 1 + __map_size(tgt_free); for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -357,8 +355,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) * shrink the inactive list to get tgt_free * number of free nodes. * - * Hence, the oldest key 1 to tgt_free/2 - * are removed from the LRU list. + * Hence, the oldest key is removed from the LRU list. */ key = 1; if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { @@ -370,8 +367,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) BPF_EXIST)); } - /* Re-insert 1 to tgt_free/2 again and do a lookup - * immeidately. + /* Re-insert 1 to batch_size again and do a lookup immediately. */ end_key = 1 + batch_size; value[0] = 4321; @@ -387,17 +383,18 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1+tgt_free to tgt_free*3/2 */ - end_key = 1 + tgt_free + batch_size; - for (key = 1 + tgt_free; key < end_key; key++) + /* Insert batch_size new elements */ + key = 1 + __map_size(tgt_free); + end_key = key + batch_size; + for (; key < end_key; key++) /* These newly added but not referenced keys will be * gone during the next LRU shrink. */ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Insert 1+tgt_free*3/2 to tgt_free*5/2 */ - end_key = key + tgt_free; + /* Insert map_size - batch_size elements */ + end_key += __map_size(tgt_free); for (; key < end_key; key++) { assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -500,7 +497,8 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free) lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free * nr_cpus); else - lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free); + lru_map_fd = create_map(map_type, map_flags, + 3 * __map_size(tgt_free)); assert(lru_map_fd != -1); expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, From ff78538e07fa284ce08cbbcb0730daa91ed16722 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 10 Jun 2025 21:41:44 -0400 Subject: [PATCH 072/297] vt: add missing notification when switching back to text mode Programs using poll() on /dev/vcsa to be notified when VT changes occur were missing one case: the switch from gfx to text mode. Signed-off-by: Nicolas Pitre Link: https://lore.kernel.org/r/9o5ro928-0pp4-05rq-70p4-ro385n21n723@onlyvoer.pbz Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index ed39d9cb4432..62049ceb34de 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -4650,6 +4650,7 @@ void do_unblank_screen(int leaving_gfx) set_palette(vc); set_cursor(vc); vt_event_post(VT_EVENT_UNBLANK, vc->vc_num, vc->vc_num); + notify_update(vc); } EXPORT_SYMBOL(do_unblank_screen); From 747b52413effe958ed57cf6d7bef80c34e1185f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Jun 2025 19:02:29 -0700 Subject: [PATCH 073/297] vt: fix kernel-doc warnings in ucs_get_fallback() Use the correct function parameter name in ucs_get_fallback() to prevent kernel-doc warnings: Warning: drivers/tty/vt/ucs.c:218 function parameter 'cp' not described in 'ucs_get_fallback' Warning: drivers/tty/vt/ucs.c:218 Excess function parameter 'base' description in 'ucs_get_fallback' Fixes: fe26933cf1e1 ("vt: add ucs_get_fallback()") Signed-off-by: Randy Dunlap Cc: Nicolas Pitre Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-serial@vger.kernel.org Reviewed-by: Nicolas Pitre . Link: https://lore.kernel.org/r/20250611020229.2650595-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/ucs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/vt/ucs.c b/drivers/tty/vt/ucs.c index 6ead622b7713..03877485dfb7 100644 --- a/drivers/tty/vt/ucs.c +++ b/drivers/tty/vt/ucs.c @@ -206,7 +206,7 @@ static int ucs_page_entry_cmp(const void *key, const void *element) /** * ucs_get_fallback() - Get a substitution for the provided Unicode character - * @base: Base Unicode code point (UCS-4) + * @cp: Unicode code point (UCS-4) * * Get a simpler fallback character for the provided Unicode character. * This is used for terminal display when corresponding glyph is unavailable. From d36f0e9a0002f04f4d6dd9be908d58fe5bd3a279 Mon Sep 17 00:00:00 2001 From: Aidan Stewart Date: Tue, 17 Jun 2025 10:48:19 -0600 Subject: [PATCH 074/297] serial: core: restore of_node information in sysfs Since in v6.8-rc1, the of_node symlink under tty devices is missing. This breaks any udev rules relying on this information. Link the of_node information in the serial controller device with the parent defined in the device tree. This will also apply to the serial device which takes the serial controller as a parent device. Fixes: b286f4e87e32 ("serial: core: Move tty and serdev to be children of serial core port device") Cc: stable@vger.kernel.org Signed-off-by: Aidan Stewart Link: https://lore.kernel.org/r/20250617164819.13912-1-astewart@tektelic.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_base_bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/serial_base_bus.c b/drivers/tty/serial/serial_base_bus.c index 5d1677f1b651..cb3b127b06b6 100644 --- a/drivers/tty/serial/serial_base_bus.c +++ b/drivers/tty/serial/serial_base_bus.c @@ -72,6 +72,7 @@ static int serial_base_device_init(struct uart_port *port, dev->parent = parent_dev; dev->bus = &serial_base_bus_type; dev->release = release; + device_set_of_node_from_dev(dev, parent_dev); if (!serial_base_initialized) { dev_dbg(port->dev, "uart_add_one_port() called before arch_initcall()?\n"); From c769be2d3dbb165a3d432f4fcca2c80fabb35877 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 22 May 2025 16:10:00 +0100 Subject: [PATCH 075/297] btrfs: include root in error message when unlinking inode To help debugging include the root number in the error message, and since this is a critical error that implies a metadata inconsistency and results in a transaction abort change the log message level from "info" to "critical", which is a much better fit. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0c778243bf1..a62f92ab5977 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4250,9 +4250,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { - btrfs_info(fs_info, - "failed to delete reference to %.*s, inode %llu parent %llu", - name->len, name->name, ino, dir_ino); + btrfs_crit(fs_info, + "failed to delete reference to %.*s, root %llu inode %llu parent %llu", + name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } From dd276214e439db08f444fd3e07e9fe4c9e0ca210 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 27 May 2025 17:04:21 -0700 Subject: [PATCH 076/297] btrfs: fix delayed ref refcount leak in debug assertion If the delayed_root is not empty we are increasing the number of references to a delayed_node without decreasing it, causing a leak. Fix by decrementing the delayed_node reference count. Reviewed-by: Filipe Manana Signed-off-by: Leo Martins Reviewed-by: Qu Wenruo [ Remove the changelog from the commit message. ] Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c7cc24a5dd5e..8c597fa60523 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); + struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + + if (WARN_ON(node)) + refcount_dec(&node->refs); } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) From 186b9dc3c302ad706b3f23c857eb128165f6b484 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 27 May 2025 17:04:22 -0700 Subject: [PATCH 077/297] btrfs: warn if leaking delayed_nodes in btrfs_put_root() Add a warning for leaked delayed_nodes when putting a root. We currently do this for inodes, but not delayed_nodes. Signed-off-by: Leo Martins Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo [ Remove the changelog from the commit message. ] Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1beb9458f622..3def93016963 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1835,6 +1835,8 @@ void btrfs_put_root(struct btrfs_root *root) if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); + if (WARN_ON(!xa_empty(&root->delayed_nodes))) + xa_destroy(&root->delayed_nodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); From 65d5112b4d7cf019ccb62cf40077038aae66239b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 20 May 2025 19:42:23 +0800 Subject: [PATCH 078/297] btrfs: scrub: add prefix for the error messages Add a "scrub: " prefix to all messages logged by scrub so that it's easy to filter them from dmesg for analysis. Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- fs/btrfs/scrub.c | 53 ++++++++++++++++++++++++------------------------ 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a498fe524c90..1e8f7082239c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3142,7 +3142,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + btrfs_err(fs_info, "scrub: extent tree v2 not yet supported"); return -EINVAL; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ed120621021b..94618add3b44 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -557,7 +557,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", +"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -571,7 +571,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * /* Super block error, no need to search extent tree. */ if (is_super) { - btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + btrfs_warn_in_rcu(fs_info, "scrub: %s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } @@ -631,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * &ref_level); if (ret < 0) { btrfs_warn(fs_info, - "failed to resolve tree backref for logical %llu: %d", - swarn.logical, ret); + "scrub: failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); break; } if (ret > 0) break; btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", +"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); @@ -718,7 +718,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_bytenr(header), logical); return; @@ -728,7 +728,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad fsid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, header->fsid, fs_info->fs_devices->fsid); return; @@ -738,7 +738,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, header->chunk_tree_uuid, fs_info->chunk_tree_uuid); return; @@ -760,7 +760,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); @@ -771,7 +771,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad generation, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_generation(header), stripe->sectors[sector_nr].generation); @@ -814,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) */ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { btrfs_warn_rl(fs_info, - "tree block at %llu crosses stripe boundary %llu", + "scrub: tree block at %llu crosses stripe boundary %llu", stripe->logical + (sector_nr << fs_info->sectorsize_bits), stripe->logical); @@ -1046,12 +1046,12 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, if (repaired) { if (dev) { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s physical %llu", + "scrub: fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on mirror %u", + "scrub: fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } continue; @@ -1060,12 +1060,12 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* The remaining are all for unrepaired. */ if (dev) { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s physical %llu", +"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on mirror %u", + "scrub: unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } @@ -1593,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, physical, sctx->write_pointer); if (ret) - btrfs_err(fs_info, - "zoned: failed to recover write pointer"); + btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); } mutex_unlock(&sctx->wr_lock); btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); @@ -1658,7 +1657,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, int ret; if (unlikely(!extent_root || !csum_root)) { - btrfs_err(fs_info, "no valid extent or csum root for scrub"); + btrfs_err(fs_info, "scrub: no valid extent or csum root found"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1907,7 +1906,7 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, - "stripe %llu has unrepaired metadata sector at %llu", + "scrub: stripe %llu has unrepaired metadata sector at logical %llu", stripe->logical, stripe->logical + (i << fs_info->sectorsize_bits)); return true; @@ -2167,7 +2166,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, -"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", +"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); ret = -EIO; @@ -2789,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ro_set = 0; } else if (ret == -ETXTBSY) { btrfs_warn(fs_info, - "skipping scrub of block group %llu due to active swapfile", + "scrub: skipping scrub of block group %llu due to active swapfile", cache->start); scrub_pause_off(fs_info); ret = 0; goto skip_unfreeze; } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); + btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", + ret); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); @@ -2898,13 +2897,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, ret = btrfs_check_super_csum(fs_info, sb); if (ret != 0) { btrfs_err_rl(fs_info, - "super block at physical %llu devid %llu has bad csum", + "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } if (btrfs_super_generation(sb) != generation) { btrfs_err_rl(fs_info, -"super block at physical %llu devid %llu has bad generation %llu expect %llu", +"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, btrfs_super_generation(sb), generation); return -EUCLEAN; @@ -3065,7 +3064,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, - "scrub on devid %llu: filesystem on %s is not writable", + "scrub: devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; From 3ca864de852bc91007b32d2a0d48993724f4abad Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 28 May 2025 12:28:27 +0100 Subject: [PATCH 079/297] btrfs: fix a race between renames and directory logging We have a race between a rename and directory inode logging that if it happens and we crash/power fail before the rename completes, the next time the filesystem is mounted, the log replay code will end up deleting the file that was being renamed. This is best explained following a step by step analysis of an interleaving of steps that lead into this situation. Consider the initial conditions: 1) We are at transaction N; 2) We have directories A and B created in a past transaction (< N); 3) We have inode X corresponding to a file that has 2 hardlinks, one in directory A and the other in directory B, so we'll name them as "A/foo_link1" and "B/foo_link2". Both hard links were persisted in a past transaction (< N); 4) We have inode Y corresponding to a file that as a single hard link and is located in directory A, we'll name it as "A/bar". This file was also persisted in a past transaction (< N). The steps leading to a file loss are the following and for all of them we are under transaction N: 1) Link "A/foo_link1" is removed, so inode's X last_unlink_trans field is updated to N, through btrfs_unlink() -> btrfs_record_unlink_dir(); 2) Task A starts a rename for inode Y, with the goal of renaming from "A/bar" to "A/baz", so we enter btrfs_rename(); 3) Task A inserts the new BTRFS_INODE_REF_KEY for inode Y by calling btrfs_insert_inode_ref(); 4) Because the rename happens in the same directory, we don't set the last_unlink_trans field of directoty A's inode to the current transaction id, that is, we don't cal btrfs_record_unlink_dir(); 5) Task A then removes the entries from directory A (BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY items) when calling __btrfs_unlink_inode() (actually the dir index item is added as a delayed item, but the effect is the same); 6) Now before task A adds the new entry "A/baz" to directory A by calling btrfs_add_link(), another task, task B is logging inode X; 7) Task B starts a fsync of inode X and after logging inode X, at btrfs_log_inode_parent() it calls btrfs_log_all_parents(), since inode X has a last_unlink_trans value of N, set at in step 1; 8) At btrfs_log_all_parents() we search for all parent directories of inode X using the commit root, so we find directories A and B and log them. Bu when logging direct A, we don't have a dir index item for inode Y anymore, neither the old name "A/bar" nor for the new name "A/baz" since the rename has deleted the old name but has not yet inserted the new name - task A hasn't called yet btrfs_add_link() to do that. Note that logging directory A doesn't fallback to a transaction commit because its last_unlink_trans has a lower value than the current transaction's id (see step 4); 9) Task B finishes logging directories A and B and gets back to btrfs_sync_file() where it calls btrfs_sync_log() to persist the log tree; 10) Task B successfully persisted the log tree, btrfs_sync_log() completed with success, and a power failure happened. We have a log tree without any directory entry for inode Y, so the log replay code deletes the entry for inode Y, name "A/bar", from the subvolume tree since it doesn't exist in the log tree and the log tree is authorative for its index (we logged a BTRFS_DIR_LOG_INDEX_KEY item that covers the index range for the dentry that corresponds to "A/bar"). Since there's no other hard link for inode Y and the log replay code deletes the name "A/bar", the file is lost. The issue wouldn't happen if task B synced the log only after task A called btrfs_log_new_name(), which would update the log with the new name for inode Y ("A/bar"). Fix this by pinning the log root during renames before removing the old directory entry, and unpinning after btrfs_log_new_name() is called. Fixes: 259c4b96d78d ("btrfs: stop doing unnecessary log updates during a rename") CC: stable@vger.kernel.org # 5.18+ Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 83 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a62f92ab5977..26d6ed170a19 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8059,6 +8059,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8182,6 +8183,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8253,30 +8279,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_end_log_trans(dest); out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8326,6 +8345,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8460,6 +8480,29 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8524,7 +8567,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8540,6 +8583,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: From ae4477f937569d097ca5dbce92a89ba384b49bc6 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 29 May 2025 10:37:44 +0100 Subject: [PATCH 080/297] btrfs: update superblock's device bytes_used when dropping chunk Each superblock contains a copy of the device item for that device. In a transaction which drops a chunk but doesn't create any new ones, we were correctly updating the device item in the chunk tree but not copying over the new bytes_used value to the superblock. This can be seen by doing the following: # dd if=/dev/zero of=test bs=4096 count=2621440 # mkfs.btrfs test # mount test /root/temp # cd /root/temp # for i in {00..10}; do dd if=/dev/zero of=$i bs=4096 count=32768; done # sync # rm * # sync # btrfs balance start -dusage=0 . # sync # cd # umount /root/temp # btrfs check test For btrfs-check to detect this, you will also need my patch at https://github.com/kdave/btrfs-progs/pull/991. Change btrfs_remove_dev_extents() so that it adds the devices to the fs_info->post_commit_list if they're not there already. This causes btrfs_commit_device_sizes() to be called, which updates the bytes_used value in the superblock. Fixes: bbbf7243d62d ("btrfs: combine device update operations during transaction commit") CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89835071cfea..f475b4b7c457 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3282,6 +3282,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } From e5b5596011773a38e035e9633ed928ef13c720b1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 Jun 2025 19:45:35 +0100 Subject: [PATCH 081/297] btrfs: fix double unlock of buffer_tree xarray when releasing subpage eb If we break out of the loop because an extent buffer doesn't have the bit EXTENT_BUFFER_TREE_REF set, we end up unlocking the xarray twice, once before we tested for the bit and break out of the loop, and once again after the loop. Fix this by testing the bit and exiting before unlocking the xarray. The time spent testing the bit is negligible and it's not worth trying to do that outside the critical section delimited by the xarray lock due to the code complexity required to avoid it (like using a local boolean variable to track whether the xarray is locked or not). The xarray unlock only needs to be done before calling release_extent_buffer(), as that needs to lock the xarray (through xa_cmpxchg_irq()) and does a more significant amount of work. Fixes: 19d7f65f032f ("btrfs: convert the buffer_radix to an xarray") Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-btrfs/aDRNDU0GM1_D4Xnw@stanley.mountain/ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 849199768664..1dc931c4937f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4312,7 +4312,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_unlock(&eb->refs_lock); continue; } - xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4329,6 +4328,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ + xa_unlock_irq(&fs_info->buffer_tree); release_extent_buffer(eb); xa_lock_irq(&fs_info->buffer_tree); } From 2dcf838cf5c2f0f4501edaa1680fcad03618d760 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 Jun 2025 19:29:01 +0100 Subject: [PATCH 082/297] btrfs: fix invalid inode pointer dereferences during log replay In a few places where we call read_one_inode(), if we get a NULL pointer we end up jumping into an error path, or fallthrough in case of __add_inode_ref(), where we then do something like this: iput(&inode->vfs_inode); which results in an invalid inode pointer that triggers an invalid memory access, resulting in a crash. Fix this by making sure we don't do such dereferences. Fixes: b4c50cbb01a1 ("btrfs: return a btrfs_inode from read_one_inode()") CC: stable@vger.kernel.org # 6.15+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 97e933113b82..21d2f3dded51 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -668,15 +668,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + return 0; } inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + if (!inode) + return -EIO; /* * first check to see if we already have this extent in the @@ -961,7 +958,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(&inode->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1176,8 +1174,8 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, ret = unlink_inode_for_log_replay(trans, victim_parent, inode, &victim_name); + iput(&victim_parent->vfs_inode); } - iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; From 16edae52f60658caeaa0702e7cb6b738a09d4ad4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 Jun 2025 22:06:17 +0100 Subject: [PATCH 083/297] btrfs: don't silently ignore unexpected extent type when replaying log If there's an unexpected (invalid) extent type, we just silently ignore it. This means a corruption or some bug somewhere, so instead return -EUCLEAN to the caller, making log replay fail, and print an error message with relevant information. Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 21d2f3dded51..858b609e292c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -668,7 +668,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - return 0; + btrfs_err(fs_info, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), key->objectid, key->offset); + return -EUCLEAN; } inode = read_one_inode(root, key->objectid); From 1961d20f6fa8903266ed9bd77c691924c22c8f02 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 Jun 2025 20:51:03 +0100 Subject: [PATCH 084/297] btrfs: fix assertion when building free space tree When building the free space tree with the block group tree feature enabled, we can hit an assertion failure like this: BTRFS info (device loop0 state M): rebuilding free space tree assertion failed: ret == 0, in fs/btrfs/free-space-tree.c:1102 ------------[ cut here ]------------ kernel BUG at fs/btrfs/free-space-tree.c:1102! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP Modules linked in: CPU: 1 UID: 0 PID: 6592 Comm: syz-executor322 Not tainted 6.15.0-rc7-syzkaller-gd7fa1af5b33e #0 PREEMPT Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : populate_free_space_tree+0x514/0x518 fs/btrfs/free-space-tree.c:1102 lr : populate_free_space_tree+0x514/0x518 fs/btrfs/free-space-tree.c:1102 sp : ffff8000a4ce7600 x29: ffff8000a4ce76e0 x28: ffff0000c9bc6000 x27: ffff0000ddfff3d8 x26: ffff0000ddfff378 x25: dfff800000000000 x24: 0000000000000001 x23: ffff8000a4ce7660 x22: ffff70001499cecc x21: ffff0000e1d8c160 x20: ffff0000e1cb7800 x19: ffff0000e1d8c0b0 x18: 00000000ffffffff x17: ffff800092f39000 x16: ffff80008ad27e48 x15: ffff700011e740c0 x14: 1ffff00011e740c0 x13: 0000000000000004 x12: ffffffffffffffff x11: ffff700011e740c0 x10: 0000000000ff0100 x9 : 94ef24f55d2dbc00 x8 : 94ef24f55d2dbc00 x7 : 0000000000000001 x6 : 0000000000000001 x5 : ffff8000a4ce6f98 x4 : ffff80008f415ba0 x3 : ffff800080548ef0 x2 : 0000000000000000 x1 : 0000000100000000 x0 : 000000000000003e Call trace: populate_free_space_tree+0x514/0x518 fs/btrfs/free-space-tree.c:1102 (P) btrfs_rebuild_free_space_tree+0x14c/0x54c fs/btrfs/free-space-tree.c:1337 btrfs_start_pre_rw_mount+0xa78/0xe10 fs/btrfs/disk-io.c:3074 btrfs_remount_rw fs/btrfs/super.c:1319 [inline] btrfs_reconfigure+0x828/0x2418 fs/btrfs/super.c:1543 reconfigure_super+0x1d4/0x6f0 fs/super.c:1083 do_remount fs/namespace.c:3365 [inline] path_mount+0xb34/0xde0 fs/namespace.c:4200 do_mount fs/namespace.c:4221 [inline] __do_sys_mount fs/namespace.c:4432 [inline] __se_sys_mount fs/namespace.c:4409 [inline] __arm64_sys_mount+0x3e8/0x468 fs/namespace.c:4409 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x58/0x17c arch/arm64/kernel/entry-common.c:767 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 Code: f0047182 91178042 528089c3 9771d47b (d4210000) ---[ end trace 0000000000000000 ]--- This happens because we are processing an empty block group, which has no extents allocated from it, there are no items for this block group, including the block group item since block group items are stored in a dedicated tree when using the block group tree feature. It also means this is the block group with the highest start offset, so there are no higher keys in the extent root, hence btrfs_search_slot_for_read() returns 1 (no higher key found). Fix this by asserting 'ret' is 0 only if the block group tree feature is not enabled, in which case we should find a block group item for the block group since it's stored in the extent root and block group item keys are greater than extent item keys (the value for BTRFS_BLOCK_GROUP_ITEM_KEY is 192 and for BTRFS_EXTENT_ITEM_KEY and BTRFS_METADATA_ITEM_KEY the values are 168 and 169 respectively). In case 'ret' is 1, we just need to add a record to the free space tree which spans the whole block group, and we can achieve this by making 'ret == 0' as the while loop's condition. Reported-by: syzbot+36fae25c35159a763a2a@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6841dca8.a00a0220.d4325.0020.GAE@google.com/ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 0c573d46639a..a3e2a2a81461 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1115,11 +1115,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1149,8 +1159,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { ret = __add_to_free_space_tree(trans, block_group, path2, From a26bf338cdad3643a6e7c3d78a172baadba15c1a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Jun 2025 16:54:44 +0100 Subject: [PATCH 085/297] btrfs: fix race between async reclaim worker and close_ctree() Syzbot reported an assertion failure due to an attempt to add a delayed iput after we have set BTRFS_FS_STATE_NO_DELAYED_IPUT in the fs_info state: WARNING: CPU: 0 PID: 65 at fs/btrfs/inode.c:3420 btrfs_add_delayed_iput+0x2f8/0x370 fs/btrfs/inode.c:3420 Modules linked in: CPU: 0 UID: 0 PID: 65 Comm: kworker/u8:4 Not tainted 6.15.0-next-20250530-syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Workqueue: btrfs-endio-write btrfs_work_helper RIP: 0010:btrfs_add_delayed_iput+0x2f8/0x370 fs/btrfs/inode.c:3420 Code: 4e ad 5d (...) RSP: 0018:ffffc9000213f780 EFLAGS: 00010293 RAX: ffffffff83c635b7 RBX: ffff888058920000 RCX: ffff88801c769e00 RDX: 0000000000000000 RSI: 0000000000000100 RDI: 0000000000000000 RBP: 0000000000000001 R08: ffff888058921b67 R09: 1ffff1100b12436c R10: dffffc0000000000 R11: ffffed100b12436d R12: 0000000000000001 R13: dffffc0000000000 R14: ffff88807d748000 R15: 0000000000000100 FS: 0000000000000000(0000) GS:ffff888125c53000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00002000000bd038 CR3: 000000006a142000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_put_ordered_extent+0x19f/0x470 fs/btrfs/ordered-data.c:635 btrfs_finish_one_ordered+0x11d8/0x1b10 fs/btrfs/inode.c:3312 btrfs_work_helper+0x399/0xc20 fs/btrfs/async-thread.c:312 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 This can happen due to a race with the async reclaim worker like this: 1) The async metadata reclaim worker enters shrink_delalloc(), which calls btrfs_start_delalloc_roots() with an nr_pages argument that has a value less than LONG_MAX, and that in turn enters start_delalloc_inodes(), which sets the local variable 'full_flush' to false because wbc->nr_to_write is less than LONG_MAX; 2) There it finds inode X in a root's delalloc list, grabs a reference for inode X (with igrab()), and triggers writeback for it with filemap_fdatawrite_wbc(), which creates an ordered extent for inode X; 3) The unmount sequence starts from another task, we enter close_ctree() and we flush the workqueue fs_info->endio_write_workers, which waits for the ordered extent for inode X to complete and when dropping the last reference of the ordered extent, with btrfs_put_ordered_extent(), when we call btrfs_add_delayed_iput() we don't add the inode to the list of delayed iputs because it has a refcount of 2, so we decrement it to 1 and return; 4) Shortly after at close_ctree() we call btrfs_run_delayed_iputs() which runs all delayed iputs, and then we set BTRFS_FS_STATE_NO_DELAYED_IPUT in the fs_info state; 5) The async reclaim worker, after calling filemap_fdatawrite_wbc(), now calls btrfs_add_delayed_iput() for inode X and there we trigger an assertion failure since the fs_info state has the flag BTRFS_FS_STATE_NO_DELAYED_IPUT set. Fix this by setting BTRFS_FS_STATE_NO_DELAYED_IPUT only after we wait for the async reclaim workers to finish, after we call cancel_work_sync() for them at close_ctree(), and by running delayed iputs after wait for the reclaim workers to finish and before setting the bit. This race was recently introduced by commit 19e60b2a95f5 ("btrfs: add extra warning if delayed iput is added when it's not allowed"). Without the new validation at btrfs_add_delayed_iput(), this described scenario was safe because close_ctree() later calls btrfs_commit_super(). That will run any final delayed iputs added by reclaim workers in the window between the btrfs_run_delayed_iputs() and the the reclaim workers being shut down. Reported-by: syzbot+0ed30ad435bf6f5b7a42@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6840481c.a00a0220.d4325.000c.GAE@google.com/T/#u Fixes: 19e60b2a95f5 ("btrfs: add extra warning if delayed iput is added when it's not allowed") Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3def93016963..f48f9d924a62 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4312,8 +4312,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * * So wait for all ongoing ordered extents to complete and then run * delayed iputs. This works because once we reach this point no one - * can either create new ordered extents nor create delayed iputs - * through some other means. + * can create new ordered extents, but delayed iputs can still be added + * by a reclaim worker (see comments further below). * * Also note that btrfs_wait_ordered_roots() is not safe here, because * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, @@ -4324,15 +4324,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->endio_write_workers); /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); + /* + * Run delayed iputs in case an async reclaim worker is waiting for them + * to be run as mentioned above. + */ btrfs_run_delayed_iputs(fs_info); - /* There should be no more workload to generate new delayed iputs. */ - set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); cancel_work_sync(&fs_info->em_shrinker_work); + /* + * Run delayed iputs again because an async reclaim worker may have + * added new ones if it was flushing delalloc: + * + * shrink_delalloc() -> btrfs_start_delalloc_roots() -> + * start_delalloc_inodes() -> btrfs_add_delayed_iput() + */ + btrfs_run_delayed_iputs(fs_info); + + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); From 547e836661554dcfa15c212a3821664e85b4191a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 7 Jun 2025 09:18:43 +0930 Subject: [PATCH 086/297] btrfs: handle csum tree error with rescue=ibadroots correctly [BUG] There is syzbot based reproducer that can crash the kernel, with the following call trace: (With some debug output added) DEBUG: rescue=ibadroots parsed BTRFS: device fsid 14d642db-7b15-43e4-81e6-4b8fac6a25f8 devid 1 transid 8 /dev/loop0 (7:0) scanned by repro (1010) BTRFS info (device loop0): first mount of filesystem 14d642db-7b15-43e4-81e6-4b8fac6a25f8 BTRFS info (device loop0): using blake2b (blake2b-256-generic) checksum algorithm BTRFS info (device loop0): using free-space-tree BTRFS warning (device loop0): checksum verify failed on logical 5312512 mirror 1 wanted 0xb043382657aede36608fd3386d6b001692ff406164733d94e2d9a180412c6003 found 0x810ceb2bacb7f0f9eb2bf3b2b15c02af867cb35ad450898169f3b1f0bd818651 level 0 DEBUG: read tree root path failed for tree csum, ret=-5 BTRFS warning (device loop0): checksum verify failed on logical 5328896 mirror 1 wanted 0x51be4e8b303da58e6340226815b70e3a93592dac3f30dd510c7517454de8567a found 0x51be4e8b303da58e634022a315b70e3a93592dac3f30dd510c7517454de8567a level 0 BTRFS warning (device loop0): checksum verify failed on logical 5292032 mirror 1 wanted 0x1924ccd683be9efc2fa98582ef58760e3848e9043db8649ee382681e220cdee4 found 0x0cb6184f6e8799d9f8cb335dccd1d1832da1071d12290dab3b85b587ecacca6e level 0 process 'repro' launched './file2' with NULL argv: empty string added DEBUG: no csum root, idatacsums=0 ibadroots=134217728 Oops: general protection fault, probably for non-canonical address 0xdffffc0000000041: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000208-0x000000000000020f] CPU: 5 UID: 0 PID: 1010 Comm: repro Tainted: G OE 6.15.0-custom+ #249 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:btrfs_lookup_csum+0x93/0x3d0 [btrfs] Call Trace: btrfs_lookup_bio_sums+0x47a/0xdf0 [btrfs] btrfs_submit_bbio+0x43e/0x1a80 [btrfs] submit_one_bio+0xde/0x160 [btrfs] btrfs_readahead+0x498/0x6a0 [btrfs] read_pages+0x1c3/0xb20 page_cache_ra_order+0x4b5/0xc20 filemap_get_pages+0x2d3/0x19e0 filemap_read+0x314/0xde0 __kernel_read+0x35b/0x900 bprm_execve+0x62e/0x1140 do_execveat_common.isra.0+0x3fc/0x520 __x64_sys_execveat+0xdc/0x130 do_syscall_64+0x54/0x1d0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ---[ end trace 0000000000000000 ]--- [CAUSE] Firstly the fs has a corrupted csum tree root, thus to mount the fs we have to go "ro,rescue=ibadroots" mount option. Normally with that mount option, a bad csum tree root should set BTRFS_FS_STATE_NO_DATA_CSUMS flag, so that any future data read will ignore csum search. But in this particular case, we have the following call trace that caused NULL csum root, but not setting BTRFS_FS_STATE_NO_DATA_CSUMS: load_global_roots_objectid(): ret = btrfs_search_slot(); /* Succeeded */ btrfs_item_key_to_cpu() found = true; /* We found the root item for csum tree. */ root = read_tree_root_path(); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) /* * Since we have rescue=ibadroots mount option, * @ret is still 0. */ break; if (!found || ret) { /* @found is true, @ret is 0, error handling for csum * tree is skipped. */ } This means we completely skipped to set BTRFS_FS_STATE_NO_DATA_CSUMS if the csum tree is corrupted, which results unexpected later csum lookup. [FIX] If read_tree_root_path() failed, always populate @ret to the error number. As at the end of the function, we need @ret to determine if we need to do the extra error handling for csum tree. Fixes: abed4aaae4f7 ("btrfs: track the csum, extent, and free space trees in a rb tree") Reported-by: Zhiyu Zhang Reported-by: Longxing Li Reviewed-by: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f48f9d924a62..0d6ad7512f21 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2158,8 +2158,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); From c0d90a79e8e65b89037508276b2b31f41a1b3783 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 6 Jun 2025 09:17:41 +0200 Subject: [PATCH 087/297] btrfs: zoned: fix alloc_offset calculation for partly conventional block groups When one of two zones composing a DUP block group is a conventional zone, we have the zone_info[i]->alloc_offset = WP_CONVENTIONAL. That will, of course, not match the write pointer of the other zone, and fails that block group. This commit solves that issue by properly recovering the emulated write pointer from the last allocated extent. The offset for the SINGLE, DUP, and RAID1 are straight-forward: it is same as the end of last allocated extent. The RAID0 and RAID10 are a bit tricky that we need to do the math of striping. This is the kernel equivalent of Naohiro's user-space commit: "btrfs-progs: zoned: fix alloc_offset calculation for partly conventional block groups". Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 86 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b5b0156d5b95..9430b34d3cbb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1403,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1426,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].physical); return -EIO; } + + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) + zone_info[0].alloc_offset = last_alloc; + + if (zone_info[1].alloc_offset == WP_CONVENTIONAL) + zone_info[1].alloc_offset = last_alloc; + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); @@ -1446,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; int i; @@ -1461,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); for (i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = last_alloc; + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && !btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, @@ -1494,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1505,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, map->num_stripes); + div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > i) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == i) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; @@ -1526,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1537,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (test_bit(0, active) != test_bit(i, active)) { @@ -1549,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, + map->num_stripes / map->sub_stripes); + div_u64_rem(stripe_nr, + (map->num_stripes / map->sub_stripes), + &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > (i / map->sub_stripes)) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == (i / map->sub_stripes)) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if ((i % map->sub_stripes) == 0) { bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; @@ -1637,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active, + last_alloc); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid1(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid0(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid10(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: From a55bc4ffc06d8c965a7d6f0a01ed0ed41380df28 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 9 Jun 2025 14:13:14 -0700 Subject: [PATCH 088/297] staging: rtl8723bs: Avoid memset() in aes_cipher() and aes_decipher() After commit 6f110a5e4f99 ("Disable SLUB_TINY for build testing"), which causes CONFIG_KASAN to be enabled in allmodconfig again, arm64 allmodconfig builds with older versions of clang (15 through 17) show an instance of -Wframe-larger-than (which breaks the build with CONFIG_WERROR=y): drivers/staging/rtl8723bs/core/rtw_security.c:1287:5: error: stack frame size (2208) exceeds limit (2048) in 'rtw_aes_decrypt' [-Werror,-Wframe-larger-than] 1287 | u32 rtw_aes_decrypt(struct adapter *padapter, u8 *precvframe) | ^ This comes from aes_decipher() being inlined in rtw_aes_decrypt(). Running the same build with CONFIG_FRAME_WARN=128 shows aes_cipher() also uses a decent amount of stack, just under the limit of 2048: drivers/staging/rtl8723bs/core/rtw_security.c:864:19: warning: stack frame size (1952) exceeds limit (128) in 'aes_cipher' [-Wframe-larger-than] 864 | static signed int aes_cipher(u8 *key, uint hdrlen, | ^ -Rpass-analysis=stack-frame-layout only shows one large structure on the stack, which is the ctx variable inlined from aes128k128d(). A good number of the other variables come from the additional checks of fortified string routines, which are present in memset(), which both aes_cipher() and aes_decipher() use to initialize some temporary buffers. In this case, since the size is known at compile time, these additional checks should not result in any code generation changes but allmodconfig has several sanitizers enabled, which may make it harder for the compiler to eliminate the compile time checks and the variables that come about from them. The memset() calls are just initializing these buffers to zero, so use '= {}' instead, which is used all over the kernel and does the exact same thing as memset() without the fortify checks, which drops the stack usage of these functions by a few hundred kilobytes. drivers/staging/rtl8723bs/core/rtw_security.c:864:19: warning: stack frame size (1584) exceeds limit (128) in 'aes_cipher' [-Wframe-larger-than] 864 | static signed int aes_cipher(u8 *key, uint hdrlen, | ^ drivers/staging/rtl8723bs/core/rtw_security.c:1271:5: warning: stack frame size (1456) exceeds limit (128) in 'rtw_aes_decrypt' [-Wframe-larger-than] 1271 | u32 rtw_aes_decrypt(struct adapter *padapter, u8 *precvframe) | ^ Cc: stable@vger.kernel.org Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver") Signed-off-by: Nathan Chancellor Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20250609-rtl8723bs-fix-clang-arm64-wflt-v1-1-e2accba43def@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/core/rtw_security.c | 44 ++++++------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/drivers/staging/rtl8723bs/core/rtw_security.c b/drivers/staging/rtl8723bs/core/rtw_security.c index 1e9eff01b1aa..e9f382c280d9 100644 --- a/drivers/staging/rtl8723bs/core/rtw_security.c +++ b/drivers/staging/rtl8723bs/core/rtw_security.c @@ -868,29 +868,21 @@ static signed int aes_cipher(u8 *key, uint hdrlen, num_blocks, payload_index; u8 pn_vector[6]; - u8 mic_iv[16]; - u8 mic_header1[16]; - u8 mic_header2[16]; - u8 ctr_preload[16]; + u8 mic_iv[16] = {}; + u8 mic_header1[16] = {}; + u8 mic_header2[16] = {}; + u8 ctr_preload[16] = {}; /* Intermediate Buffers */ - u8 chain_buffer[16]; - u8 aes_out[16]; - u8 padded_buffer[16]; + u8 chain_buffer[16] = {}; + u8 aes_out[16] = {}; + u8 padded_buffer[16] = {}; u8 mic[8]; uint frtype = GetFrameType(pframe); uint frsubtype = GetFrameSubType(pframe); frsubtype = frsubtype>>4; - memset((void *)mic_iv, 0, 16); - memset((void *)mic_header1, 0, 16); - memset((void *)mic_header2, 0, 16); - memset((void *)ctr_preload, 0, 16); - memset((void *)chain_buffer, 0, 16); - memset((void *)aes_out, 0, 16); - memset((void *)padded_buffer, 0, 16); - if ((hdrlen == WLAN_HDR_A3_LEN) || (hdrlen == WLAN_HDR_A3_QOS_LEN)) a4_exists = 0; else @@ -1080,15 +1072,15 @@ static signed int aes_decipher(u8 *key, uint hdrlen, num_blocks, payload_index; signed int res = _SUCCESS; u8 pn_vector[6]; - u8 mic_iv[16]; - u8 mic_header1[16]; - u8 mic_header2[16]; - u8 ctr_preload[16]; + u8 mic_iv[16] = {}; + u8 mic_header1[16] = {}; + u8 mic_header2[16] = {}; + u8 ctr_preload[16] = {}; /* Intermediate Buffers */ - u8 chain_buffer[16]; - u8 aes_out[16]; - u8 padded_buffer[16]; + u8 chain_buffer[16] = {}; + u8 aes_out[16] = {}; + u8 padded_buffer[16] = {}; u8 mic[8]; uint frtype = GetFrameType(pframe); @@ -1096,14 +1088,6 @@ static signed int aes_decipher(u8 *key, uint hdrlen, frsubtype = frsubtype>>4; - memset((void *)mic_iv, 0, 16); - memset((void *)mic_header1, 0, 16); - memset((void *)mic_header2, 0, 16); - memset((void *)ctr_preload, 0, 16); - memset((void *)chain_buffer, 0, 16); - memset((void *)aes_out, 0, 16); - memset((void *)padded_buffer, 0, 16); - /* start to decrypt the payload */ num_blocks = (plen-8) / 16; /* plen including LLC, payload_length and mic) */ From b2348fe6c81cc92c7d0bd8a7d9241f8de0fa72b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Jun 2025 12:25:41 -0400 Subject: [PATCH 089/297] bcachefs: Fix *__bch2_trans_subbuf_alloc() error path Don't change buf->size on error - this would usually be a transaction restart, but it could also be -ENOMEM - when we've exceeded the bump allocator max). Fixes: 247abee6ae6d ("bcachefs: btree_trans_subbuf") Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index e97e78c10f49..ee657b9f4b96 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -549,20 +549,26 @@ void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, unsigned u64s) { unsigned new_top = buf->u64s + u64s; - unsigned old_size = buf->size; + unsigned new_size = buf->size; - if (new_top > buf->size) - buf->size = roundup_pow_of_two(new_top); + BUG_ON(roundup_pow_of_two(new_top) > U16_MAX); - void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64)); + if (new_top > new_size) + new_size = roundup_pow_of_two(new_top); + + void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64)); if (IS_ERR(n)) return n; + unsigned offset = (u64 *) n - (u64 *) trans->mem; + BUG_ON(offset > U16_MAX); + if (buf->u64s) memcpy(n, btree_trans_subbuf_base(trans, buf), - old_size * sizeof(u64)); + buf->size * sizeof(u64)); buf->base = (u64 *) n - (u64 *) trans->mem; + buf->size = new_size; void *p = btree_trans_subbuf_top(trans, buf); buf->u64s = new_top; From 32a01cd4334175a0ae42b3c3d7f37fd26468ecc3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Jun 2025 12:50:06 -0400 Subject: [PATCH 090/297] bcachefs: Don't log fsck err in the journal if doing repair elsewhere This fixes exceeding the bump allocator limit when the allocator finds many buckets that need repair - they're repaired asynchronously, which means that every error logged a message in the bump allocator, without committing. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 13 +++++++++---- fs/bcachefs/btree_iter.c | 4 ++++ fs/bcachefs/error.c | 4 +++- fs/bcachefs/sb-errors_format.h | 7 ++++--- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b228a5a64479..66de46318620 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1406,6 +1406,9 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite : BCH_DATA_free; struct printbuf buf = PRINTBUF; + unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)| + FSCK_CAN_FIX|FSCK_CAN_IGNORE; + struct bpos bucket = iter->pos; bucket.offset &= ~(~0ULL << 56); u64 genbits = iter->pos.offset & (~0ULL << 56); @@ -1419,9 +1422,10 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite return ret; if (!bch2_dev_bucket_exists(c, bucket)) { - if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + if (__fsck_err(trans, fsck_flags, + need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) goto delete; ret = 1; goto out; @@ -1433,7 +1437,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite if (a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a))) { - if (fsck_err(trans, need_discard_freespace_key_bad, + if (__fsck_err(trans, fsck_flags, + need_discard_freespace_key_bad, "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 061603999ae5..352f9cd2634f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3194,6 +3194,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", + BTREE_TRANS_MEM_MAX); + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index a8ec6aae5738..b2a6c041e165 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -621,7 +621,9 @@ int __bch2_fsck_err(struct bch_fs *c, if (s) s->ret = ret; - if (trans) + if (trans && + !(flags & FSCK_ERR_NO_LOG) && + ret == -BCH_ERR_fsck_fix) ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; err_unlock: mutex_unlock(&c->fsck_error_msgs_lock); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index f1aa40542a0e..041887aad63a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -3,9 +3,10 @@ #define _BCACHEFS_SB_ERRORS_FORMAT_H enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_AUTOFIX = 1 << 2, + FSCK_CAN_FIX = BIT(0), + FSCK_CAN_IGNORE = BIT(1), + FSCK_AUTOFIX = BIT(2), + FSCK_ERR_NO_LOG = BIT(3), }; #define BCH_SB_ERRS() \ From b2e2bed119809a5ca384241e0631f04c6142ae08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Jun 2025 14:32:57 -0400 Subject: [PATCH 091/297] bcachefs: Add missing key type checks to check_snapshot_exists() For now we only have one key type in these btrees, but forward compatibility means we do have to check. Reported-by: syzbot+b4cb4a6988aced0cec4b@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 38aeaa128d27..4c43d2a2c1f5 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -871,7 +871,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + if (k.k->type == KEY_TYPE_snapshot_tree && + le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { tree_id = k.k->p.offset; break; } @@ -899,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + if (k.k->type == KEY_TYPE_subvolume && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { snapshot->v.subvol = cpu_to_le32(k.k->p.offset); SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); break; From fde46f60f6c5138ee422087addbc5bf5b4968bf1 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 10 Jun 2025 15:48:27 -0400 Subject: [PATCH 092/297] selinux: change security_compute_sid to return the ssid or tsid on match If the end result of a security_compute_sid() computation matches the ssid or tsid, return that SID rather than looking it up again. This avoids the problem of multiple initial SIDs that map to the same context. Cc: stable@vger.kernel.org Reported-by: Guido Trentalancia Fixes: ae254858ce07 ("selinux: introduce an initial SID for early boot processes") Signed-off-by: Stephen Smalley Tested-by: Guido Trentalancia Signed-off-by: Paul Moore --- security/selinux/ss/services.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 7becf3808818..d185754c2786 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -1909,11 +1909,17 @@ static int security_compute_sid(u32 ssid, goto out_unlock; } /* Obtain the sid for the context. */ - rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); - if (rc == -ESTALE) { - rcu_read_unlock(); - context_destroy(&newcontext); - goto retry; + if (context_equal(scontext, &newcontext)) + *out_sid = ssid; + else if (context_equal(tcontext, &newcontext)) + *out_sid = tsid; + else { + rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); + if (rc == -ESTALE) { + rcu_read_unlock(); + context_destroy(&newcontext); + goto retry; + } } out_unlock: rcu_read_unlock(); From e0fca6f2cebff539e9317a15a37dcf432e3b851a Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 17 Jun 2025 18:36:46 -0700 Subject: [PATCH 093/297] net: mana: Record doorbell physical address in PF mode MANA supports RDMA in PF mode. The driver should record the doorbell physical address when in PF mode. The doorbell physical address is used by the RDMA driver to map doorbell pages of the device to user-mode applications through RDMA verbs interface. In the past, they have been mapped to user-mode while the device is in VF mode. With the support for PF mode implemented, also expose those pages in PF mode. Support for PF mode is implemented in 290e5d3c49f6 ("net: mana: Add support for Multi Vports on Bare metal") Signed-off-by: Long Li Reviewed-by: Simon Horman Link: https://patch.msgid.link/1750210606-12167-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 3504507477c6..52cf7112762c 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -31,6 +31,9 @@ static void mana_gd_init_pf_regs(struct pci_dev *pdev) gc->db_page_base = gc->bar0_va + mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + gc->phys_db_page_base = gc->bar0_pa + + mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF); sriov_base_va = gc->bar0_va + sriov_base_off; From 752eb816b55adb0673727ba0ed96609a17895654 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 4 Jun 2025 12:25:56 +0800 Subject: [PATCH 094/297] scsi: megaraid_sas: Fix invalid node index On a system with DRAM interleave enabled, out-of-bound access is detected: megaraid_sas 0000:3f:00.0: requested/available msix 128/128 poll_queue 0 ------------[ cut here ]------------ UBSAN: array-index-out-of-bounds in ./arch/x86/include/asm/topology.h:72:28 index -1 is out of range for type 'cpumask *[1024]' dump_stack_lvl+0x5d/0x80 ubsan_epilogue+0x5/0x2b __ubsan_handle_out_of_bounds.cold+0x46/0x4b megasas_alloc_irq_vectors+0x149/0x190 [megaraid_sas] megasas_probe_one.cold+0xa4d/0x189c [megaraid_sas] local_pci_probe+0x42/0x90 pci_device_probe+0xdc/0x290 really_probe+0xdb/0x340 __driver_probe_device+0x78/0x110 driver_probe_device+0x1f/0xa0 __driver_attach+0xba/0x1c0 bus_for_each_dev+0x8b/0xe0 bus_add_driver+0x142/0x220 driver_register+0x72/0xd0 megasas_init+0xdf/0xff0 [megaraid_sas] do_one_initcall+0x57/0x310 do_init_module+0x90/0x250 init_module_from_file+0x85/0xc0 idempotent_init_module+0x114/0x310 __x64_sys_finit_module+0x65/0xc0 do_syscall_64+0x82/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it accordingly. Signed-off-by: Chen Yu Link: https://lore.kernel.org/r/20250604042556.3731059-1-yu.c.chen@intel.com Fixes: 8049da6f3943 ("scsi: megaraid_sas: Use irq_set_affinity_and_hint()") Cc: stable@vger.kernel.org Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 3aac0e17cb00..9179f8aee964 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5910,7 +5910,11 @@ megasas_set_high_iops_queue_affinity_and_hint(struct megasas_instance *instance) const struct cpumask *mask; if (instance->perf_mode == MR_BALANCED_PERF_MODE) { - mask = cpumask_of_node(dev_to_node(&instance->pdev->dev)); + int nid = dev_to_node(&instance->pdev->dev); + + if (nid == NUMA_NO_NODE) + nid = 0; + mask = cpumask_of_node(nid); for (i = 0; i < instance->low_latency_index_start; i++) { irq = pci_irq_vector(instance->pdev, i); From 2e083cd802294693a5414e4557a183dd7e442e71 Mon Sep 17 00:00:00 2001 From: anvithdosapati Date: Mon, 16 Jun 2025 08:57:34 +0000 Subject: [PATCH 095/297] scsi: ufs: core: Fix clk scaling to be conditional in reset and restore In ufshcd_host_reset_and_restore(), scale up clocks only when clock scaling is supported. Without this change CPU latency is voted for 0 (ufshcd_pm_qos_update) during resume unconditionally. Signed-off-by: anvithdosapati Link: https://lore.kernel.org/r/20250616085734.2133581-1-anvithdosapati@google.com Fixes: a3cd5ec55f6c ("scsi: ufs: add load based scaling of UFS gear") Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index f62d89c8e580..50adfb8b335b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -7807,7 +7807,8 @@ static int ufshcd_host_reset_and_restore(struct ufs_hba *hba) hba->silence_err_logs = false; /* scale up clocks to max frequency before full reinitialization */ - ufshcd_scale_clks(hba, ULONG_MAX, true); + if (ufshcd_is_clkscaling_supported(hba)) + ufshcd_scale_clks(hba, ULONG_MAX, true); err = ufshcd_hba_enable(hba); From a35b29bdedb4d2ae3160d4d6684a6f1ecd9ca7c2 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:28 -0700 Subject: [PATCH 096/297] scsi: fnic: Fix crash in fnic_wq_cmpl_handler when FDMI times out When both the RHBA and RPA FDMI requests time out, fnic reuses a frame to send ABTS for each of them. On send completion, this causes an attempt to free the same frame twice that leads to a crash. Fix crash by allocating separate frames for RHBA and RPA, and modify ABTS logic accordingly. Tested by checking MDS for FDMI information. Tested by using instrumented driver to: - Drop PLOGI response - Drop RHBA response - Drop RPA response - Drop RHBA and RPA response - Drop PLOGI response + ABTS response - Drop RHBA response + ABTS response - Drop RPA response + ABTS response - Drop RHBA and RPA response + ABTS response for both of them Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Tested-by: Arun Easi Co-developed-by: Arun Easi Signed-off-by: Arun Easi Tested-by: Karan Tilak Kumar Cc: stable@vger.kernel.org Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/r/20250618003431.6314-1-kartilak@cisco.com Reviewed-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fdls_disc.c | 113 +++++++++++++++++++++++++--------- drivers/scsi/fnic/fnic.h | 2 +- drivers/scsi/fnic/fnic_fdls.h | 1 + 3 files changed, 87 insertions(+), 29 deletions(-) diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c index f8ab69c51dab..36b498ad55b4 100644 --- a/drivers/scsi/fnic/fdls_disc.c +++ b/drivers/scsi/fnic/fdls_disc.c @@ -763,47 +763,69 @@ static void fdls_send_fabric_abts(struct fnic_iport_s *iport) iport->fabric.timer_pending = 1; } -static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) +static uint8_t *fdls_alloc_init_fdmi_abts_frame(struct fnic_iport_s *iport, + uint16_t oxid) { - uint8_t *frame; + struct fc_frame_header *pfdmi_abts; uint8_t d_id[3]; + uint8_t *frame; struct fnic *fnic = iport->fnic; - struct fc_frame_header *pfabric_abts; - unsigned long fdmi_tov; - uint16_t oxid; - uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + - sizeof(struct fc_frame_header); frame = fdls_alloc_frame(iport); if (frame == NULL) { FNIC_FCS_DBG(KERN_ERR, fnic->host, fnic->fnic_num, "Failed to allocate frame to send FDMI ABTS"); - return; + return NULL; } - pfabric_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET); + pfdmi_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET); fdls_init_fabric_abts_frame(frame, iport); hton24(d_id, FC_FID_MGMT_SERV); - FNIC_STD_SET_D_ID(*pfabric_abts, d_id); + FNIC_STD_SET_D_ID(*pfdmi_abts, d_id); + FNIC_STD_SET_OX_ID(*pfdmi_abts, oxid); + + return frame; +} + +static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) +{ + uint8_t *frame; + unsigned long fdmi_tov; + uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + + sizeof(struct fc_frame_header); if (iport->fabric.fdmi_pending & FDLS_FDMI_PLOGI_PENDING) { - oxid = iport->active_oxid_fdmi_plogi; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_plogi); + if (frame == NULL) + return; + fnic_send_fcoe_frame(iport, frame, frame_size); } else { if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) { - oxid = iport->active_oxid_fdmi_rhba; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_rhba); + if (frame == NULL) + return; + fnic_send_fcoe_frame(iport, frame, frame_size); } if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) { - oxid = iport->active_oxid_fdmi_rpa; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_rpa); + if (frame == NULL) { + if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) + goto arm_timer; + else + return; + } + fnic_send_fcoe_frame(iport, frame, frame_size); } } +arm_timer: fdmi_tov = jiffies + msecs_to_jiffies(2 * iport->e_d_tov); mod_timer(&iport->fabric.fdmi_timer, round_jiffies(fdmi_tov)); iport->fabric.fdmi_pending |= FDLS_FDMI_ABORT_PENDING; @@ -2245,6 +2267,21 @@ void fdls_fabric_timer_callback(struct timer_list *t) spin_unlock_irqrestore(&fnic->fnic_lock, flags); } +void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport) +{ + struct fnic *fnic = iport->fnic; + + iport->fabric.fdmi_pending = 0; + /* If max retries not exhausted, start over from fdmi plogi */ + if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) { + iport->fabric.fdmi_retry++; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Retry FDMI PLOGI. FDMI retry: %d", + iport->fabric.fdmi_retry); + fdls_send_fdmi_plogi(iport); + } +} + void fdls_fdmi_timer_callback(struct timer_list *t) { struct fnic_fdls_fabric_s *fabric = timer_container_of(fabric, t, @@ -2291,14 +2328,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t) FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); - iport->fabric.fdmi_pending = 0; - /* If max retries not exhaused, start over from fdmi plogi */ - if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) { - iport->fabric.fdmi_retry++; - FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "retry fdmi timer %d", iport->fabric.fdmi_retry); - fdls_send_fdmi_plogi(iport); - } + fdls_fdmi_retry_plogi(iport); FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); spin_unlock_irqrestore(&fnic->fnic_lock, flags); @@ -3716,11 +3746,32 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, switch (FNIC_FRAME_TYPE(oxid)) { case FNIC_FRAME_TYPE_FDMI_PLOGI: fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_plogi); + + iport->fabric.fdmi_pending &= ~FDLS_FDMI_PLOGI_PENDING; + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; break; case FNIC_FRAME_TYPE_FDMI_RHBA: + iport->fabric.fdmi_pending &= ~FDLS_FDMI_REG_HBA_PENDING; + + /* If RPA is still pending, don't turn off ABORT PENDING. + * We count on the timer to detect the ABTS timeout and take + * corrective action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING)) + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rhba); break; case FNIC_FRAME_TYPE_FDMI_RPA: + iport->fabric.fdmi_pending &= ~FDLS_FDMI_RPA_PENDING; + + /* If RHBA is still pending, don't turn off ABORT PENDING. + * We count on the timer to detect the ABTS timeout and take + * corrective action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING)) + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rpa); break; default: @@ -3730,10 +3781,16 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, break; } - timer_delete_sync(&iport->fabric.fdmi_timer); - iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; - - fdls_send_fdmi_plogi(iport); + /* + * Only if ABORT PENDING is off, delete the timer, and if no other + * operations are pending, retry FDMI. + * Otherwise, let the timer pop and take the appropriate action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_ABORT_PENDING)) { + timer_delete_sync(&iport->fabric.fdmi_timer); + if (!iport->fabric.fdmi_pending) + fdls_fdmi_retry_plogi(iport); + } } static void diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h index 6c5f6046b1f5..86e293ce530d 100644 --- a/drivers/scsi/fnic/fnic.h +++ b/drivers/scsi/fnic/fnic.h @@ -30,7 +30,7 @@ #define DRV_NAME "fnic" #define DRV_DESCRIPTION "Cisco FCoE HBA Driver" -#define DRV_VERSION "1.8.0.0" +#define DRV_VERSION "1.8.0.1" #define PFX DRV_NAME ": " #define DFX DRV_NAME "%d: " diff --git a/drivers/scsi/fnic/fnic_fdls.h b/drivers/scsi/fnic/fnic_fdls.h index 8e610b65ad57..531d0b37e450 100644 --- a/drivers/scsi/fnic/fnic_fdls.h +++ b/drivers/scsi/fnic/fnic_fdls.h @@ -394,6 +394,7 @@ void fdls_send_tport_abts(struct fnic_iport_s *iport, bool fdls_delete_tport(struct fnic_iport_s *iport, struct fnic_tport_s *tport); void fdls_fdmi_timer_callback(struct timer_list *t); +void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport); /* fnic_fcs.c */ void fnic_fdls_init(struct fnic *fnic, int usefip); From 74f46a0524f8d2f01dc7ca95bb5fc463a8603e72 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:29 -0700 Subject: [PATCH 097/297] scsi: fnic: Turn off FDMI ACTIVE flags on link down When the link goes down and comes up, FDMI requests are not sent out anymore. Fix bug by turning off FNIC_FDMI_ACTIVE when the link goes down. Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Reviewed-by: Arun Easi Tested-by: Karan Tilak Kumar Cc: stable@vger.kernel.org Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/r/20250618003431.6314-2-kartilak@cisco.com Reviewed-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fdls_disc.c | 9 ++++++--- drivers/scsi/fnic/fnic.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c index 36b498ad55b4..fa9cf0b37d72 100644 --- a/drivers/scsi/fnic/fdls_disc.c +++ b/drivers/scsi/fnic/fdls_disc.c @@ -5029,9 +5029,12 @@ void fnic_fdls_link_down(struct fnic_iport_s *iport) fdls_delete_tport(iport, tport); } - if ((fnic_fdmi_support == 1) && (iport->fabric.fdmi_pending > 0)) { - timer_delete_sync(&iport->fabric.fdmi_timer); - iport->fabric.fdmi_pending = 0; + if (fnic_fdmi_support == 1) { + if (iport->fabric.fdmi_pending > 0) { + timer_delete_sync(&iport->fabric.fdmi_timer); + iport->fabric.fdmi_pending = 0; + } + iport->flags &= ~FNIC_FDMI_ACTIVE; } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h index 86e293ce530d..c2fdc6553e62 100644 --- a/drivers/scsi/fnic/fnic.h +++ b/drivers/scsi/fnic/fnic.h @@ -30,7 +30,7 @@ #define DRV_NAME "fnic" #define DRV_DESCRIPTION "Cisco FCoE HBA Driver" -#define DRV_VERSION "1.8.0.1" +#define DRV_VERSION "1.8.0.2" #define PFX DRV_NAME ": " #define DFX DRV_NAME "%d: " From 9b9b8594654a79e3d4166356fd86cd5397477b24 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:30 -0700 Subject: [PATCH 098/297] scsi: fnic: Add and improve logs in FDMI and FDMI ABTS paths Add logs in FDMI and FDMI ABTS paths. Modify log text in these paths. Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Reviewed-by: Arun Easi Reviewed-by: John Meneghini Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/r/20250618003431.6314-3-kartilak@cisco.com Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fdls_disc.c | 65 +++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c index fa9cf0b37d72..ae37f85f618b 100644 --- a/drivers/scsi/fnic/fdls_disc.c +++ b/drivers/scsi/fnic/fdls_disc.c @@ -791,6 +791,7 @@ static uint8_t *fdls_alloc_init_fdmi_abts_frame(struct fnic_iport_s *iport, static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) { uint8_t *frame; + struct fnic *fnic = iport->fnic; unsigned long fdmi_tov; uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + sizeof(struct fc_frame_header); @@ -801,6 +802,9 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) if (frame == NULL) return; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: FDLS send FDMI PLOGI abts. iport->fabric.state: %d oxid: 0x%x", + iport->fcid, iport->fabric.state, iport->active_oxid_fdmi_plogi); fnic_send_fcoe_frame(iport, frame, frame_size); } else { if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) { @@ -809,6 +813,9 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) if (frame == NULL) return; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: FDLS send FDMI RHBA abts. iport->fabric.state: %d oxid: 0x%x", + iport->fcid, iport->fabric.state, iport->active_oxid_fdmi_rhba); fnic_send_fcoe_frame(iport, frame, frame_size); } if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) { @@ -821,6 +828,9 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) return; } + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: FDLS send FDMI RPA abts. iport->fabric.state: %d oxid: 0x%x", + iport->fcid, iport->fabric.state, iport->active_oxid_fdmi_rpa); fnic_send_fcoe_frame(iport, frame, frame_size); } } @@ -829,6 +839,10 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) fdmi_tov = jiffies + msecs_to_jiffies(2 * iport->e_d_tov); mod_timer(&iport->fabric.fdmi_timer, round_jiffies(fdmi_tov)); iport->fabric.fdmi_pending |= FDLS_FDMI_ABORT_PENDING; + + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); } static void fdls_send_fabric_flogi(struct fnic_iport_s *iport) @@ -2294,7 +2308,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t) spin_lock_irqsave(&fnic->fnic_lock, flags); FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); if (!iport->fabric.fdmi_pending) { /* timer expired after fdmi responses received. */ @@ -2302,7 +2316,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t) return; } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); /* if not abort pending, send an abort */ if (!(iport->fabric.fdmi_pending & FDLS_FDMI_ABORT_PENDING)) { @@ -2311,26 +2325,37 @@ void fdls_fdmi_timer_callback(struct timer_list *t) return; } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); /* ABTS pending for an active fdmi request that is pending. * That means FDMI ABTS timed out * Schedule to free the OXID after 2*r_a_tov and proceed */ if (iport->fabric.fdmi_pending & FDLS_FDMI_PLOGI_PENDING) { + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "FDMI PLOGI ABTS timed out. Schedule oxid free: 0x%x\n", + iport->active_oxid_fdmi_plogi); fdls_schedule_oxid_free(iport, &iport->active_oxid_fdmi_plogi); } else { - if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) + if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) { + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "FDMI RHBA ABTS timed out. Schedule oxid free: 0x%x\n", + iport->active_oxid_fdmi_rhba); fdls_schedule_oxid_free(iport, &iport->active_oxid_fdmi_rhba); - if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) + } + if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) { + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "FDMI RPA ABTS timed out. Schedule oxid free: 0x%x\n", + iport->active_oxid_fdmi_rpa); fdls_schedule_oxid_free(iport, &iport->active_oxid_fdmi_rpa); + } } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); fdls_fdmi_retry_plogi(iport); FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); spin_unlock_irqrestore(&fnic->fnic_lock, flags); } @@ -3745,12 +3770,26 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, switch (FNIC_FRAME_TYPE(oxid)) { case FNIC_FRAME_TYPE_FDMI_PLOGI: + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Received FDMI PLOGI ABTS rsp with oxid: 0x%x", oxid); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_plogi); iport->fabric.fdmi_pending &= ~FDLS_FDMI_PLOGI_PENDING; iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); break; case FNIC_FRAME_TYPE_FDMI_RHBA: + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Received FDMI RHBA ABTS rsp with oxid: 0x%x", oxid); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); + iport->fabric.fdmi_pending &= ~FDLS_FDMI_REG_HBA_PENDING; /* If RPA is still pending, don't turn off ABORT PENDING. @@ -3761,8 +3800,17 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rhba); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); break; case FNIC_FRAME_TYPE_FDMI_RPA: + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Received FDMI RPA ABTS rsp with oxid: 0x%x", oxid); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); + iport->fabric.fdmi_pending &= ~FDLS_FDMI_RPA_PENDING; /* If RHBA is still pending, don't turn off ABORT PENDING. @@ -3773,6 +3821,9 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rpa); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); break; default: FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, From 18b5cb6f1fdda4454f55a31f7c78d94da62be495 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:31 -0700 Subject: [PATCH 099/297] scsi: fnic: Set appropriate logging level for log message Replace KERN_INFO with KERN_DEBUG for a log message. Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Reviewed-by: Arun Easi Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/stable/20250612002212.4144-1-kartilak%40cisco.com Link: https://lore.kernel.org/r/20250618003431.6314-4-kartilak@cisco.com Reviewed-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index 7133b254cbe4..75b29a018d1f 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -1046,7 +1046,7 @@ static void fnic_fcpio_icmnd_cmpl_handler(struct fnic *fnic, unsigned int cq_ind if (icmnd_cmpl->scsi_status == SAM_STAT_TASK_SET_FULL) atomic64_inc(&fnic_stats->misc_stats.queue_fulls); - FNIC_SCSI_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + FNIC_SCSI_DBG(KERN_DEBUG, fnic->host, fnic->fnic_num, "xfer_len: %llu", xfer_len); break; From 85d6fbc47c3087c5d048e6734926b0c36af34fe9 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Wed, 18 Jun 2025 08:57:04 +0200 Subject: [PATCH 100/297] scsi: fnic: Fix missing DMA mapping error in fnic_send_frame() dma_map_XXX() can fail and should be tested for errors with dma_mapping_error(). Fixes: a63e78eb2b0f ("scsi: fnic: Add support for fabric based solicited requests and responses") Signed-off-by: Thomas Fourier Link: https://lore.kernel.org/r/20250618065715.14740-2-fourier.thomas@gmail.com Reviewed-by: Karan Tilak Kumar Reviewed-by: John Menghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_fcs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/fnic/fnic_fcs.c b/drivers/scsi/fnic/fnic_fcs.c index 1e8cd64f9a5c..103ab6f1f7cd 100644 --- a/drivers/scsi/fnic/fnic_fcs.c +++ b/drivers/scsi/fnic/fnic_fcs.c @@ -636,6 +636,8 @@ static int fnic_send_frame(struct fnic *fnic, void *frame, int frame_len) unsigned long flags; pa = dma_map_single(&fnic->pdev->dev, frame, frame_len, DMA_TO_DEVICE); + if (dma_mapping_error(&fnic->pdev->dev, pa)) + return -ENOMEM; if ((fnic_fc_trace_set_data(fnic->fnic_num, FNIC_FC_SEND | 0x80, (char *) frame, From a05dd8ae5cbb1cb45f349922cfea4f548a5e5d6f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Jun 2025 01:17:51 +0800 Subject: [PATCH 101/297] mm/shmem, swap: fix softlockup with mTHP swapin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following softlockup can be easily reproduced on my test machine with: echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled swapon /dev/zram0 # zram0 is a 48G swap device mkdir -p /sys/fs/cgroup/memory/test echo 1G > /sys/fs/cgroup/test/memory.max echo $BASHPID > /sys/fs/cgroup/test/cgroup.procs while true; do dd if=/dev/zero of=/tmp/test.img bs=1M count=5120 cat /tmp/test.img > /dev/null rm /tmp/test.img done Then after a while: watchdog: BUG: soft lockup - CPU#0 stuck for 763s! [cat:5787] Modules linked in: zram virtiofs CPU: 0 UID: 0 PID: 5787 Comm: cat Kdump: loaded Tainted: G L 6.15.0.orig-gf3021d9246bc-dirty #118 PREEMPT(voluntary)· Tainted: [L]=SOFTLOCKUP Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015 RIP: 0010:mpol_shared_policy_lookup+0xd/0x70 Code: e9 b8 b4 ff ff 31 c0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 41 54 55 53 <48> 8b 1f 48 85 db 74 41 4c 8d 67 08 48 89 fb 48 89 f5 4c 89 e7 e8 RSP: 0018:ffffc90002b1fc28 EFLAGS: 00000202 RAX: 00000000001c20ca RBX: 0000000000724e1e RCX: 0000000000000001 RDX: ffff888118e214c8 RSI: 0000000000057d42 RDI: ffff888118e21518 RBP: 000000000002bec8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000bf4 R11: 0000000000000000 R12: 0000000000000001 R13: 00000000001c20ca R14: 00000000001c20ca R15: 0000000000000000 FS: 00007f03f995c740(0000) GS:ffff88a07ad9a000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f03f98f1000 CR3: 0000000144626004 CR4: 0000000000770eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: shmem_alloc_folio+0x31/0xc0 shmem_swapin_folio+0x309/0xcf0 ? filemap_get_entry+0x117/0x1e0 ? xas_load+0xd/0xb0 ? filemap_get_entry+0x101/0x1e0 shmem_get_folio_gfp+0x2ed/0x5b0 shmem_file_read_iter+0x7f/0x2e0 vfs_read+0x252/0x330 ksys_read+0x68/0xf0 do_syscall_64+0x4c/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f03f9a46991 Code: 00 48 8b 15 81 14 10 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 20 ad 01 00 f3 0f 1e fa 80 3d 35 97 10 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007fff3c52bd28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000040000 RCX: 00007f03f9a46991 RDX: 0000000000040000 RSI: 00007f03f98ba000 RDI: 0000000000000003 RBP: 00007fff3c52bd50 R08: 0000000000000000 R09: 00007f03f9b9a380 R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000 R13: 00007f03f98ba000 R14: 0000000000000003 R15: 0000000000000000 The reason is simple, readahead brought some order 0 folio in swap cache, and the swapin mTHP folio being allocated is in conflict with it, so swapcache_prepare fails and causes shmem_swap_alloc_folio to return -EEXIST, and shmem simply retries again and again causing this loop. Fix it by applying a similar fix for anon mTHP swapin. The performance change is very slight, time of swapin 10g zero folios with shmem (test for 12 times): Before: 2.47s After: 2.48s [kasong@tencent.com: add comment] Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250609171751.36305-1-ryncsn@gmail.com Fixes: 1dd44c0af4fa ("mm: shmem: skip swapcache for swapin of synchronous swap device") Signed-off-by: Kairui Song Reviewed-by: Barry Song Acked-by: Nhat Pham Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Chris Li Cc: Hugh Dickins Cc: Kemeng Shi Cc: Usama Arif Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 20 -------------------- mm/shmem.c | 6 +++++- mm/swap.h | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 8eba595056fe..b0cda5aab398 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4315,26 +4315,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. - */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. diff --git a/mm/shmem.c b/mm/shmem.c index 0c5fb4ffa03a..3a5a65b1f41a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2259,6 +2259,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = swap_cache_get_folio(swap, NULL, 0); order = xa_get_order(&mapping->i_pages, index); if (!folio) { + int nr_pages = 1 << order; bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ @@ -2272,9 +2273,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. */ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled())) + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) fallback_order0 = true; /* Skip swapcache for synchronous device. */ diff --git a/mm/swap.h b/mm/swap.h index 2269eb9df0af..9096082a915e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing mTHP swapin, we need to + * ensure all entries are not cached, otherwise, the mTHP folio will + * be in conflict with the folio in swap cache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return 0; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ /** From 965f87700adbcc6d72430f524d88135027f5bba3 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Mon, 9 Jun 2025 12:06:07 +0000 Subject: [PATCH 102/297] selftests/mm: increase timeout from 180 to 900 seconds The mm selftests are timing out with the current 180-second limit. Testing shows that run_vmtests.sh takes approximately 11 minutes (664 seconds) to complete. Increase the timeout to 900 seconds (15 minutes) to provide sufficient buffer for the tests to complete successfully. Link: https://lkml.kernel.org/r/20250609120606.73145-2-shivankg@amd.com Signed-off-by: Shivank Garg Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings index a953c96aa16e..e2206265f67c 100644 --- a/tools/testing/selftests/mm/settings +++ b/tools/testing/selftests/mm/settings @@ -1 +1 @@ -timeout=180 +timeout=900 From 517f496e1e61bd169d585dab4dd77e7147506322 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 11 Jun 2025 15:13:14 +0200 Subject: [PATCH 103/297] mm/gup: revert "mm: gup: fix infinite loop within __get_longterm_locked" After commit 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked") we are able to longterm pin folios that are not supposed to get longterm pinned, simply because they temporarily have the LRU flag cleared (esp. temporarily isolated). For example, two __get_longterm_locked() callers can race, or __get_longterm_locked() can race with anything else that temporarily isolates folios. The introducing commit mentions the use case of a driver that uses vm_ops->fault to insert pages allocated through cma_alloc() into the page tables, assuming they can later get longterm pinned. These pages/ folios would never have the LRU flag set and consequently cannot get isolated. There is no known in-tree user making use of that so far, fortunately. To handle that in the future -- and avoid retrying forever to isolate/migrate them -- we will need a different mechanism for the CMA area *owner* to indicate that it actually already allocated the page and is fine with longterm pinning it. The LRU flag is not suitable for that. Probably we can lookup the relevant CMA area and query the bitmap; we only have have to care about some races, probably. If already allocated, we could just allow longterm pinning) Anyhow, let's fix the "must not be longterm pinned" problem first by reverting the original commit. Link: https://lkml.kernel.org/r/20250611131314.594529-1-david@redhat.com Fixes: 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked") Signed-off-by: David Hildenbrand Closes: https://lore.kernel.org/all/20250522092755.GA3277597@tiffany/ Reported-by: Hyesoo Yu Reviewed-by: John Hubbard Cc: Jason Gunthorpe Cc: Peter Xu Cc: Zhaoyang Huang Cc: Aijun Sun Cc: Alistair Popple Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index e065a49842a8..3c39cbbeebef 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2303,13 +2303,13 @@ static void pofs_unpin(struct pages_or_folios *pofs) /* * Returns the number of collected folios. Return value is always >= 0. */ -static void collect_longterm_unpinnable_folios( +static unsigned long collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { + unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; - unsigned long i; for (i = 0; i < pofs->nr_entries; i++) { struct folio *folio = pofs_get_folio(pofs, i); @@ -2321,6 +2321,8 @@ static void collect_longterm_unpinnable_folios( if (folio_is_longterm_pinnable(folio)) continue; + collected++; + if (folio_is_device_coherent(folio)) continue; @@ -2342,6 +2344,8 @@ static void collect_longterm_unpinnable_folios( NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } + + return collected; } /* @@ -2418,9 +2422,11 @@ static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); + unsigned long collected; - collect_longterm_unpinnable_folios(&movable_folio_list, pofs); - if (list_empty(&movable_folio_list)) + collected = collect_longterm_unpinnable_folios(&movable_folio_list, + pofs); + if (!collected) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); From 0ea148a799198518d8ebab63ddd0bb6114a103bc Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 4 Jun 2025 23:10:38 +0800 Subject: [PATCH 104/297] mm: userfaultfd: fix race of userfaultfd_move and swap cache This commit fixes two kinds of races, they may have different results: Barry reported a BUG_ON in commit c50f8e6053b0, we may see the same BUG_ON if the filemap lookup returned NULL and folio is added to swap cache after that. If another kind of race is triggered (folio changed after lookup) we may see RSS counter is corrupted: [ 406.893936] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0 type:MM_ANONPAGES val:-1 [ 406.894071] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0 type:MM_SHMEMPAGES val:1 Because the folio is being accounted to the wrong VMA. I'm not sure if there will be any data corruption though, seems no. The issues above are critical already. On seeing a swap entry PTE, userfaultfd_move does a lockless swap cache lookup, and tries to move the found folio to the faulting vma. Currently, it relies on checking the PTE value to ensure that the moved folio still belongs to the src swap entry and that no new folio has been added to the swap cache, which turns out to be unreliable. While working and reviewing the swap table series with Barry, following existing races are observed and reproduced [1]: In the example below, move_pages_pte is moving src_pte to dst_pte, where src_pte is a swap entry PTE holding swap entry S1, and S1 is not in the swap cache: CPU1 CPU2 userfaultfd_move move_pages_pte() entry = pte_to_swp_entry(orig_src_pte); // Here it got entry = S1 ... < interrupted> ... // folio A is a new allocated folio // and get installed into src_pte // src_pte now points to folio A, S1 // has swap count == 0, it can be freed // by folio_swap_swap or swap // allocator's reclaim. // folio B is a folio in another VMA. // S1 is freed, folio B can use it // for swap out with no problem. ... folio = filemap_get_folio(S1) // Got folio B here !!! ... < interrupted again> ... // Now S1 is free to be used again. // Now src_pte is a swap entry PTE // holding S1 again. folio_trylock(folio) move_swap_pte double_pt_lock is_pte_pages_stable // Check passed because src_pte == S1 folio_move_anon_rmap(...) // Moved invalid folio B here !!! The race window is very short and requires multiple collisions of multiple rare events, so it's very unlikely to happen, but with a deliberately constructed reproducer and increased time window, it can be reproduced easily. This can be fixed by checking if the folio returned by filemap is the valid swap cache folio after acquiring the folio lock. Another similar race is possible: filemap_get_folio may return NULL, but folio (A) could be swapped in and then swapped out again using the same swap entry after the lookup. In such a case, folio (A) may remain in the swap cache, so it must be moved too: CPU1 CPU2 userfaultfd_move move_pages_pte() entry = pte_to_swp_entry(orig_src_pte); // Here it got entry = S1, and S1 is not in swap cache folio = filemap_get_folio(S1) // Got NULL ... < interrupted again> ... move_swap_pte double_pt_lock is_pte_pages_stable // Check passed because src_pte == S1 folio_move_anon_rmap(...) // folio A is ignored !!! Fix this by checking the swap cache again after acquiring the src_pte lock. And to avoid the filemap overhead, we check swap_map directly [2]. The SWP_SYNCHRONOUS_IO path does make the problem more complex, but so far we don't need to worry about that, since folios can only be exposed to the swap cache in the swap out path, and this is covered in this patch by checking the swap cache again after acquiring the src_pte lock. Testing with a simple C program that allocates and moves several GB of memory did not show any observable performance change. Link: https://lkml.kernel.org/r/20250604151038.21968-1-ryncsn@gmail.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Kairui Song Closes: https://lore.kernel.org/linux-mm/CAMgjq7B1K=6OOrK2OUZ0-tqCzi+EJt+2_K97TPGoSt=9+JwP7Q@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/CAGsJ_4yJhJBo16XhiC-nUzSheyX-V3-nFE+tAi=8Y560K8eT=A@mail.gmail.com/ [2] Reviewed-by: Lokesh Gidra Acked-by: Peter Xu Reviewed-by: Suren Baghdasaryan Reviewed-by: Barry Song Reviewed-by: Chris Li Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Kairui Song Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bc473ad21202..8253978ee0fb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) + struct folio *src_folio, + struct swap_info_struct *si, swp_entry_t entry) { + /* + * Check if the folio still belongs to the target swap entry after + * acquiring the lock. Folio can be freed in the swap cache while + * not locked. + */ + if (src_folio && unlikely(!folio_test_swapcache(src_folio) || + entry.val != src_folio->swap.val)) + return -EAGAIN; + double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, if (src_folio) { folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); + } else { + /* + * Check if the swap entry is cached after acquiring the src_pte + * lock. Otherwise, we might miss a newly loaded swap cache folio. + * + * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. + * We are trying to catch newly added swap cache, the only possible case is + * when a folio is swapped in and out again staying in swap cache, using the + * same entry before the PTE check above. The PTL is acquired and released + * twice, each time after updating the swap_map's flag. So holding + * the PTL here ensures we see the updated value. False positive is possible, + * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the + * cache, or during the tiny synchronization window between swap cache and + * swap_map, but it will be gone very quickly, worst result is retry jitters. + */ + if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1412,7 +1441,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, } err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, - dst_ptl, src_ptl, src_folio); + dst_ptl, src_ptl, src_folio, si, entry); } out: From 417d145c2e71ad7362200ede6e8afdfc6e56c4fb Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Fri, 13 Jun 2025 15:19:14 +0200 Subject: [PATCH 105/297] MAINTAINERS: add linux-mm@ list to Kexec Handover Along with kexec, KHO also has parts dealing with memory management, like page/folio initialization, memblock, and preserving/unpreserving memory for next kernel. Copy linux-mm@ to KHO patches so the right set of eyes can look at changes to those parts. Link: https://lkml.kernel.org/r/20250613131917.4488-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0c1d245bf7b8..9a7fdc52cf9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13347,6 +13347,7 @@ M: Alexander Graf M: Mike Rapoport M: Changyuan Lyu L: kexec@lists.infradead.org +L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* From 12b9a2c05d1b474518b0f5fac4a50b7f93b16930 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Thu, 5 Jun 2025 19:11:41 +0200 Subject: [PATCH 106/297] kho: initialize tail pages for higher order folios properly Currently, when restoring higher order folios, kho_restore_folio() only calls prep_compound_page() on all the pages. That is not enough to properly initialize the folios. The managed page count does not get updated, the reserved flag does not get dropped, and page count does not get initialized properly. Restoring a higher order folio with it results in the following BUG with CONFIG_DEBUG_VM when attempting to free the folio: BUG: Bad page state in process test pfn:104e2b page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffffffffffffffff pfn:0x104e2b flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) raw: 002fffff80000000 0000000000000000 00000000ffffffff 0000000000000000 raw: ffffffffffffffff 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: nonzero _refcount [...] Call Trace: dump_stack_lvl+0x4b/0x70 bad_page.cold+0x97/0xb2 __free_frozen_pages+0x616/0x850 [...] Combine the path for 0-order and higher order folios, initialize the tail pages with a count of zero, and call adjust_managed_page_count() to account for all the pages instead of just missing them. In addition, since all the KHO-preserved pages get marked with MEMBLOCK_RSRV_NOINIT by deserialize_bitmap(), the reserved flag is not actually set (as can also be seen from the flags of the dumped page in the logs above). So drop the ClearPageReserved() calls. [ptyadav@amazon.de: declare i in the loop instead of at the top] Link: https://lkml.kernel.org/r/20250613125916.39272-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20250605171143.76963-1-pratyush@kernel.org Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 69b953551677..5a21dbe17950 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -164,11 +164,21 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } /* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page) +static void kho_restore_page(struct page *page, unsigned int order) { - ClearPageReserved(page); - init_page_count(page); - adjust_managed_page_count(page, 1); + unsigned int nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); + + adjust_managed_page_count(page, nr_pages); } /** @@ -186,15 +196,10 @@ struct folio *kho_restore_folio(phys_addr_t phys) return NULL; order = page->private; - if (order) { - if (order > MAX_PAGE_ORDER) - return NULL; - - prep_compound_page(page, order); - } else { - kho_restore_page(page); - } + if (order > MAX_PAGE_ORDER) + return NULL; + kho_restore_page(page, order); return page_folio(page); } EXPORT_SYMBOL_GPL(kho_restore_folio); From 223731cd63004cb07edfc6257d53565995c00cbb Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 13 Jun 2025 09:19:12 +0530 Subject: [PATCH 107/297] selftests/mm: add configs to fix testcase failure If CONFIG_UPROBES is not set, a merge subtest fails: Failure log: 7151 12:46:54.627936 # # # RUN merge.handle_uprobe_upon_merged_vma ... 7152 12:46:54.639014 # # f /sys/bus/event_source/devices/uprobe/type 7153 12:46:54.639306 # # fopen: No such file or directory 7154 12:46:54.650451 # # # merge.c:473:handle_uprobe_upon_merged_vma:Expected read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) (1) == 0 (0) 7155 12:46:54.650730 # # # handle_uprobe_upon_merged_vma: Test terminated by assertion 7156 12:46:54.661750 # # # FAIL merge.handle_uprobe_upon_merged_vma 7157 12:46:54.662030 # # not ok 8 merge.handle_uprobe_upon_merged_vma CONFIG_UPROBES is enabled by CONFIG_UPROBE_EVENTS, which gets enabled by CONFIG_FTRACE. Therefore add these configs to selftests/mm/config so that CI systems can include this config in the kernel build. To be completely safe, add CONFIG_PROFILING too, to enable the dependency chain PROFILING -> PERF_EVENTS -> UPROBE_EVENTS -> UPROBES. Link: https://lkml.kernel.org/r/20250613034912.53791-1-dev.jain@arm.com Fixes: efe99fabeb11 ("selftests/mm: add test about uprobe pte be orphan during vma merge") Signed-off-by: Dev Jain Reported-by: Aishwarya Closes: https://lore.kernel.org/all/20250610103729.72440-1-aishwarya.tcv@arm.com/ Tested-by: Aishwarya TCV Tested-by : Donet Tom Reviewed-by: Lorenzo Stoakes Reviewed-by: Anshuman Khandual Reviewed-by: Mark Brown Reviewed-by: Donet Tom Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Liam Howlett Cc: Pu Lehui Cc: Ryan Roberts Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config index a28baa536332..deba93379c80 100644 --- a/tools/testing/selftests/mm/config +++ b/tools/testing/selftests/mm/config @@ -8,3 +8,6 @@ CONFIG_GUP_TEST=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_MEM_SOFT_DIRTY=y CONFIG_ANON_VMA_NAME=y +CONFIG_FTRACE=y +CONFIG_PROFILING=y +CONFIG_UPROBES=y From 845f1f2d69f3f49b3d8c142265952c8257e3368c Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 15 Jun 2025 04:23:51 +0800 Subject: [PATCH 108/297] Revert "bcache: update min_heap_callbacks to use default builtin swap" Patch series "bcache: Revert min_heap migration due to performance regression". This patch series reverts the migration of bcache from its original heap implementation to the generic min_heap library. While the original change aimed to simplify the code and improve maintainability, it introduced a severe performance regression in real-world scenarios. As reported by Robert, systems using bcache now suffer from periodic latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. This degrades bcache's value as a low-latency caching layer, and leads to frequent timeouts and application stalls in production environments. The primary cause of this regression is the behavior of the generic min_heap implementation's bottom-up sift_down, which performs up to 2 * log2(n) comparisons when many elements are equal. The original top-down variant used by bcache only required O(1) comparisons in such cases. The issue was further exacerbated by commit 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions"), which introduced non-inlined versions of the min_heap API, adding function call overhead to a performance-critical hot path. This patch (of 3): This reverts commit 3d8a9a1c35227c3f1b0bd132c9f0a80dbda07b65. Although removing the custom swap function simplified the code, this change is part of a broader migration to the generic min_heap API that introduced significant performance regressions in bcache. As reported by Robert, bcache now suffers from latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. These regressions degrade bcache's effectiveness as a low-latency cache layer and lead to frequent timeouts and application stalls in production environments. This revert is part of a series of changes to restore previous performance by undoing the min_heap transition. Link: https://lkml.kernel.org/r/20250614202353.1632957-1-visitorckw@gmail.com Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com Link: https://lkml.kernel.org/r/20250614202353.1632957-2-visitorckw@gmail.com Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap") Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions") Signed-off-by: Kuan-Wei Chiu Reported-by: Robert Pang Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com Acked-by: Coly Li Cc: Ching-Chun (Jim) Huang Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- drivers/md/bcache/alloc.c | 11 +++++++++-- drivers/md/bcache/bset.c | 14 +++++++++++--- drivers/md/bcache/extents.c | 10 +++++++++- drivers/md/bcache/movinggc.c | 10 +++++++++- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8998e61efa40..da50f6661bae 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -189,16 +189,23 @@ static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); } +static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) +{ + struct bucket **lhs = l, **rhs = r; + + swap(*lhs, *rhs); +} + static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; const struct min_heap_callbacks bucket_max_cmp_callback = { .less = new_bucket_max_cmp, - .swp = NULL, + .swp = new_bucket_swap, }; const struct min_heap_callbacks bucket_min_cmp_callback = { .less = new_bucket_min_cmp, - .swp = NULL, + .swp = new_bucket_swap, }; ca->heap.nr = 0; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 68258a16e125..bd97d8626887 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1093,6 +1093,14 @@ static inline bool new_btree_iter_cmp(const void *l, const void *r, void __alway return bkey_cmp(_l->k, _r->k) <= 0; } +static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) +{ + struct btree_iter_set *_iter1 = iter1; + struct btree_iter_set *_iter2 = iter2; + + swap(*_iter1, *_iter2); +} + static inline bool btree_iter_end(struct btree_iter *iter) { return !iter->heap.nr; @@ -1103,7 +1111,7 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, { const struct min_heap_callbacks callbacks = { .less = new_btree_iter_cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; if (k != end) @@ -1149,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *ret = NULL; const struct min_heap_callbacks callbacks = { .less = cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; if (!btree_iter_end(iter)) { @@ -1223,7 +1231,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, : bch_ptr_invalid; const struct min_heap_callbacks callbacks = { .less = b->ops->sort_cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; /* Heapify the iterator, using our comparison function */ diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 4b84fda1530a..a7221e5dbe81 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -266,12 +266,20 @@ static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_ return !(c ? c > 0 : _l->k < _r->k); } +static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) +{ + struct btree_iter_set *_iter1 = iter1; + struct btree_iter_set *_iter2 = iter2; + + swap(*_iter1, *_iter2); +} + static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { const struct min_heap_callbacks callbacks = { .less = new_bch_extent_sort_cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; while (iter->heap.nr > 1) { struct btree_iter_set *top = iter->heap.data, *i = top + 1; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 45ca134cbf02..d6c73dd8eb2b 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -190,6 +190,14 @@ static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *a return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); } +static void new_bucket_swap(void *l, void *r, void __always_unused *args) +{ + struct bucket **_l = l; + struct bucket **_r = r; + + swap(*_l, *_r); +} + static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; @@ -204,7 +212,7 @@ void bch_moving_gc(struct cache_set *c) unsigned long sectors_to_move, reserve_sectors; const struct min_heap_callbacks callbacks = { .less = new_bucket_cmp, - .swp = NULL, + .swp = new_bucket_swap, }; if (!c->copy_gc_enabled) From 48fd7ebe00c1cdc782b42576548b25185902f64c Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 15 Jun 2025 04:23:52 +0800 Subject: [PATCH 109/297] Revert "bcache: remove heap-related macros and switch to generic min_heap" This reverts commit 866898efbb25bb44fd42848318e46db9e785973a. The generic bottom-up min_heap implementation causes performance regression in invalidate_buckets_lru(), a hot path in bcache. Before the cache is fully populated, new_bucket_prio() often returns zero, leading to many equal comparisons. In such cases, bottom-up sift_down performs up to 2 * log2(n) comparisons, while the original top-down approach completes with just O() comparisons, resulting in a measurable performance gap. The performance degradation is further worsened by the non-inlined min_heap API functions introduced in commit 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions"), adding function call overhead to this critical path. As reported by Robert, bcache now suffers from latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. These regressions degrade bcache's effectiveness as a low-latency cache layer and lead to frequent timeouts and application stalls in production environments. This revert aims to restore bcache's original low-latency behavior. Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com Link: https://lkml.kernel.org/r/20250614202353.1632957-3-visitorckw@gmail.com Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap") Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions") Signed-off-by: Kuan-Wei Chiu Reported-by: Robert Pang Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com Acked-by: Coly Li Cc: Ching-Chun (Jim) Huang Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- drivers/md/bcache/alloc.c | 64 +++++------------- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/bset.c | 124 ++++++++++++---------------------- drivers/md/bcache/bset.h | 40 ++++++----- drivers/md/bcache/btree.c | 69 ++++++++----------- drivers/md/bcache/extents.c | 51 +++++--------- drivers/md/bcache/movinggc.c | 41 +++-------- drivers/md/bcache/super.c | 3 +- drivers/md/bcache/sysfs.c | 4 +- drivers/md/bcache/util.h | 67 +++++++++++++++++- drivers/md/bcache/writeback.c | 13 ++-- 11 files changed, 216 insertions(+), 262 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index da50f6661bae..48ce750bf70a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -164,68 +164,40 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) * prio is worth 1/8th of what INITIAL_PRIO is worth. */ -static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) -{ - unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; +#define bucket_prio(b) \ +({ \ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) - return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); -} - -static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; - - return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs); -} - -static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; - - return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); -} - -static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) -{ - struct bucket **lhs = l, **rhs = r; - - swap(*lhs, *rhs); -} +#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) +#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; - const struct min_heap_callbacks bucket_max_cmp_callback = { - .less = new_bucket_max_cmp, - .swp = new_bucket_swap, - }; - const struct min_heap_callbacks bucket_min_cmp_callback = { - .less = new_bucket_min_cmp, - .swp = new_bucket_swap, - }; + ssize_t i; - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (!bch_can_invalidate_bucket(ca, b)) continue; - if (!min_heap_full(&ca->heap)) - min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca); - else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) { + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca); + heap_sift(&ca->heap, 0, bucket_max_cmp); } } - min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca); + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); while (!fifo_full(&ca->free_inc)) { - if (!ca->heap.nr) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { /* * We don't want to be calling invalidate_buckets() * multiple times when it can't do anything @@ -234,8 +206,6 @@ static void invalidate_buckets_lru(struct cache *ca) wake_up_gc(ca->set); return; } - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca); bch_invalidate_one_bucket(ca, b); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 785b0d9008fa..1d33e40d26ea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -458,7 +458,7 @@ struct cache { /* Allocation stuff: */ struct bucket *buckets; - DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap; + DECLARE_HEAP(struct bucket *, heap); /* * If nonzero, we know we aren't going to find any buckets to invalidate diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index bd97d8626887..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,11 +54,9 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - if (b->ops->is_extents) for_each_key(b, k, &iter) ret += KEY_SIZE(k); @@ -69,11 +67,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key(b, k, &iter) { if (b->ops->is_extents) { err = "Keys out of order"; @@ -114,9 +110,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) static void bch_btree_iter_next_check(struct btree_iter *iter) { - struct bkey *k = iter->heap.data->k, *next = bkey_next(k); + struct bkey *k = iter->data->k, *next = bkey_next(k); - if (next < iter->heap.data->end && + if (next < iter->data->end && bkey_cmp(k, iter->b->ops->is_extents ? &START_KEY(next) : next) > 0) { bch_dump_bucket(iter->b); @@ -883,14 +879,12 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* * If k has preceding key, preceding_key_p will be set to address * of k's preceding key; otherwise preceding_key_p will be set @@ -901,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1083,102 +1077,79 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, /* Btree iterator */ -typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *); +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); -static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args) +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - const struct btree_iter_set *_l = l; - const struct btree_iter_set *_r = r; - - return bkey_cmp(_l->k, _r->k) <= 0; -} - -static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) -{ - struct btree_iter_set *_iter1 = iter1; - struct btree_iter_set *_iter2 = iter2; - - swap(*_iter1, *_iter2); + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) { - return !iter->heap.nr; + return !iter->used; } void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end) { - const struct min_heap_callbacks callbacks = { - .less = new_btree_iter_cmp, - .swp = new_btree_iter_swap, - }; - if (k != end) - BUG_ON(!min_heap_push(&iter->heap, - &((struct btree_iter_set) { k, end }), - &callbacks, - NULL)); + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->heap.size = ARRAY_SIZE(iter->heap.preallocated); - iter->heap.nr = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, - new_btree_iter_cmp_fn *cmp) + btree_iter_cmp_fn *cmp) { struct btree_iter_set b __maybe_unused; struct bkey *ret = NULL; - const struct min_heap_callbacks callbacks = { - .less = cmp, - .swp = new_btree_iter_swap, - }; if (!btree_iter_end(iter)) { bch_btree_iter_next_check(iter); - ret = iter->heap.data->k; - iter->heap.data->k = bkey_next(iter->heap.data->k); + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); - if (iter->heap.data->k > iter->heap.data->end) { + if (iter->data->k > iter->data->end) { WARN_ONCE(1, "bset was corrupt!\n"); - iter->heap.data->k = iter->heap.data->end; + iter->data->k = iter->data->end; } - if (iter->heap.data->k == iter->heap.data->end) { - if (iter->heap.nr) { - b = min_heap_peek(&iter->heap)[0]; - min_heap_pop(&iter->heap, &callbacks, NULL); - } - } + if (iter->data->k == iter->data->end) + heap_pop(iter, b, cmp); else - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, cmp); } return ret; @@ -1186,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *bch_btree_iter_next(struct btree_iter *iter) { - return __bch_btree_iter_next(iter, new_btree_iter_cmp); + return __bch_btree_iter_next(iter, btree_iter_cmp); } @@ -1224,18 +1195,16 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; - const struct min_heap_callbacks callbacks = { - .less = b->ops->sort_cmp, - .swp = new_btree_iter_swap, - }; /* Heapify the iterator, using our comparison function */ - min_heapify_all(&iter->heap, &callbacks, NULL); + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); while (!btree_iter_end(iter)) { if (b->ops->sort_fixup && fixup) @@ -1324,11 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1339,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1355,13 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); + bch_btree_iter_stack_init(b, &iter, NULL); - bch_btree_iter_init(b, &iter, NULL); - - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index f79441acd4c1..011f6062c4c0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -187,9 +187,8 @@ struct bset_tree { }; struct btree_keys_ops { - bool (*sort_cmp)(const void *l, - const void *r, - void *args); + bool (*sort_cmp)(struct btree_iter_set l, + struct btree_iter_set r); struct bkey *(*sort_fixup)(struct btree_iter *iter, struct bkey *tmp); bool (*insert_fixup)(struct btree_keys *b, @@ -313,17 +312,23 @@ enum { BTREE_INSERT_STATUS_FRONT_MERGE, }; -struct btree_iter_set { - struct bkey *k, *end; -}; - /* Btree key iteration */ struct btree_iter { + size_t size, used; #ifdef CONFIG_BCACHE_DEBUG struct btree_keys *b; #endif - MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap; + struct btree_iter_set { + struct bkey *k, *end; + } data[]; +}; + +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + struct btree_iter iter; + struct btree_iter_set stack_data[MAX_BSETS]; }; typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -335,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -352,13 +357,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1d0100677357..210b59007d98 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -148,19 +148,19 @@ void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = btree_bset_first(b); - struct btree_iter iter; + struct btree_iter *iter; /* * c->fill_iter can allocate an iterator with more memory space * than static MAX_BSETS. * See the comment arount cache_set->fill_iter. */ - iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; - iter.heap.nr = 0; + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; + iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter.b = &b->keys; + iter->b = &b->keys; #endif if (!i->seq) @@ -198,7 +198,7 @@ void bch_btree_node_read_done(struct btree *b) if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(&iter, i->start, bset_bkey_last(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -210,7 +210,7 @@ void bch_btree_node_read_done(struct btree *b) if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort); + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); i = b->keys.set[0].data; err = "short btree key"; @@ -222,7 +222,7 @@ void bch_btree_node_read_done(struct btree *b) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->cache->sb)); out: - mempool_free(iter.heap.data, &b->c->fill_iter); + mempool_free(iter, &b->c->fill_iter); return; err: set_btree_node_io_error(b); @@ -1306,11 +1306,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - gc->nodes++; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { @@ -1569,11 +1567,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); @@ -1612,18 +1608,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1918,9 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1928,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1959,7 +1953,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1967,11 +1961,9 @@ static int bch_btree_check_thread(void *arg) cur_idx = prev_idx = 0; ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1989,7 +1981,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2062,11 +2054,9 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* check and mark root node keys */ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(c, c->root->level, k); @@ -2560,12 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2594,12 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : bcache_btree(map_keys_recurse, k, diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index a7221e5dbe81..d626ffcbecb9 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -33,16 +33,15 @@ static void sort_key_next(struct btree_iter *iter, i->k = bkey_next(i->k); if (i->k == i->end) - *i = iter->heap.data[--iter->heap.nr]; + *i = iter->data[--iter->used]; } -static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args) +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(_l->k, _r->k); + int64_t c = bkey_cmp(l.k, r.k); - return !(c ? c > 0 : _l->k < _r->k); + return c ? c > 0 : l.k < r.k; } static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) @@ -239,7 +238,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, } const struct btree_keys_ops bch_btree_keys_ops = { - .sort_cmp = new_bch_key_sort_cmp, + .sort_cmp = bch_key_sort_cmp, .insert_fixup = bch_btree_ptr_insert_fixup, .key_invalid = bch_btree_ptr_invalid, .key_bad = bch_btree_ptr_bad, @@ -256,36 +255,22 @@ const struct btree_keys_ops bch_btree_keys_ops = { * Necessary for btree_sort_fixup() - if there are multiple keys that compare * equal in different sets, we have to process them newest to oldest. */ - -static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args) +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k)); + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - return !(c ? c > 0 : _l->k < _r->k); -} - -static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) -{ - struct btree_iter_set *_iter1 = iter1; - struct btree_iter_set *_iter2 = iter2; - - swap(*_iter1, *_iter2); + return c ? c > 0 : l.k < r.k; } static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { - const struct min_heap_callbacks callbacks = { - .less = new_bch_extent_sort_cmp, - .swp = new_btree_iter_swap, - }; - while (iter->heap.nr > 1) { - struct btree_iter_set *top = iter->heap.data, *i = top + 1; + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; - if (iter->heap.nr > 2 && - !new_bch_extent_sort_cmp(&i[0], &i[1], NULL)) + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) i++; if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) @@ -293,7 +278,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, if (!KEY_SIZE(i->k)) { sort_key_next(iter, i); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); continue; } @@ -303,7 +288,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, else bch_cut_front(top->k, i->k); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); } else { /* can't happen because of comparison func */ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); @@ -313,7 +298,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, bch_cut_back(&START_KEY(i->k), tmp); bch_cut_front(i->k, top->k); - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, bch_extent_sort_cmp); return tmp; } else { @@ -633,7 +618,7 @@ static bool bch_extent_merge(struct btree_keys *bk, } const struct btree_keys_ops bch_extent_keys_ops = { - .sort_cmp = new_bch_extent_sort_cmp, + .sort_cmp = bch_extent_sort_cmp, .sort_fixup = bch_extent_sort_fixup, .insert_fixup = bch_extent_insert_fixup, .key_invalid = bch_extent_invalid, diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d6c73dd8eb2b..26a6a535ec32 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -182,27 +182,16 @@ err: if (!IS_ERR_OR_NULL(w->private)) closure_sync(&cl); } -static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args) +static bool bucket_cmp(struct bucket *l, struct bucket *r) { - struct bucket **_l = (struct bucket **)l; - struct bucket **_r = (struct bucket **)r; - - return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); -} - -static void new_bucket_swap(void *l, void *r, void __always_unused *args) -{ - struct bucket **_l = l; - struct bucket **_r = r; - - swap(*_l, *_r); + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); } static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; - return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } void bch_moving_gc(struct cache_set *c) @@ -210,10 +199,6 @@ void bch_moving_gc(struct cache_set *c) struct cache *ca = c->cache; struct bucket *b; unsigned long sectors_to_move, reserve_sectors; - const struct min_heap_callbacks callbacks = { - .less = new_bucket_cmp, - .swp = new_bucket_swap, - }; if (!c->copy_gc_enabled) return; @@ -224,7 +209,7 @@ void bch_moving_gc(struct cache_set *c) reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (GC_MARK(b) == GC_MARK_METADATA || @@ -233,31 +218,25 @@ void bch_moving_gc(struct cache_set *c) atomic_read(&b->pin)) continue; - if (!min_heap_full(&ca->heap)) { + if (!heap_full(&ca->heap)) { sectors_to_move += GC_SECTORS_USED(b); - min_heap_push(&ca->heap, &b, &callbacks, NULL); - } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) { + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { sectors_to_move -= bucket_heap_top(ca); sectors_to_move += GC_SECTORS_USED(b); ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &callbacks, NULL); + heap_sift(&ca->heap, 0, bucket_cmp); } } while (sectors_to_move > reserve_sectors) { - if (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); - } + heap_pop(&ca->heap, b, bucket_cmp); sectors_to_move -= GC_SECTORS_USED(b); } - while (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); + while (heap_pop(&ca->heap, b, bucket_cmp)) SET_GC_MOVE(b, 1); - } mutex_unlock(&c->bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1efb768b2890..2ea490b9d370 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1912,7 +1912,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e8f696cb58c0..826b14cae4e5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -660,9 +660,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; goto lock_root; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 539454d8e2d0..f61ab1bada6c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -31,10 +30,16 @@ struct closure; #endif +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + #define init_heap(heap, _size, gfp) \ ({ \ size_t _bytes; \ - (heap)->nr = 0; \ + (heap)->used = 0; \ (heap)->size = (_size); \ _bytes = (heap)->size * sizeof(*(heap)->data); \ (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ @@ -47,6 +52,64 @@ do { \ (heap)->data = NULL; \ } while (0) +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 453efbbdc8ee..302e75f1fc4b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -908,16 +908,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -931,7 +930,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -980,13 +979,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - retry_lock: b = c->root; rw_lock(0, b, b->level); From 95b2e31e1752494d477c5da89d6789f769b0d67b Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 15 Jun 2025 04:23:53 +0800 Subject: [PATCH 110/297] bcache: remove unnecessary select MIN_HEAP After reverting the transition to the generic min heap library, bcache no longer depends on MIN_HEAP. The select entry can be removed to reduce code size and shrink the kernel's attack surface. This change effectively reverts the bcache-related part of commit 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions"). This is part of a series of changes to address a performance regression caused by the use of the generic min_heap implementation. As reported by Robert, bcache now suffers from latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. These regressions degrade bcache's effectiveness as a low-latency cache layer and lead to frequent timeouts and application stalls in production environments. Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com Link: https://lkml.kernel.org/r/20250614202353.1632957-4-visitorckw@gmail.com Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap") Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions") Signed-off-by: Kuan-Wei Chiu Reported-by: Robert Pang Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com Acked-by: Coly Li Cc: Ching-Chun (Jim) Huang Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- drivers/md/bcache/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index d4697e79d5a3..b2d10063d35f 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,7 +5,6 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES - select MIN_HEAP help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. From 3333871296efd52ef6f6d5cce5a92dc7b06ba879 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Tue, 10 Jun 2025 13:22:09 +0100 Subject: [PATCH 111/297] selftests/mm: skip uprobe vma merge test if uprobes are not enabled If uprobes are not enabled, the test currently fails with: 7151 12:46:54.627936 # # # RUN merge.handle_uprobe_upon_merged_vma ... 7152 12:46:54.639014 # # f /sys/bus/event_source/devices/uprobe/type 7153 12:46:54.639306 # # fopen: No such file or directory 7154 12:46:54.650451 # # # merge.c:473:handle_uprobe_upon_merged_vma:Expected read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) (1) == 0 (0) 7155 12:46:54.650730 # # # handle_uprobe_upon_merged_vma: Test terminated by assertion 7156 12:46:54.661750 # # # FAIL merge.handle_uprobe_upon_merged_vma 7157 12:46:54.662030 # # not ok 8 merge.handle_uprobe_upon_merged_vma Skipping is a more sane and friendly behavior here. Link: https://lkml.kernel.org/r/20250610122209.3177587-1-pfalcato@suse.de Fixes: efe99fabeb11 ("selftests/mm: add test about uprobe pte be orphan during vma merge") Signed-off-by: Pedro Falcato Reported-by: Aishwarya Closes: https://lore.kernel.org/linux-mm/20250610103729.72440-1-aishwarya.tcv@arm.com/ Reviewed-by: Lorenzo Stoakes Tested-by : Donet Tom Reviewed-by : Donet Tom Reviewed-by: Dev Jain Reviewed-by: Pu Lehui Cc: Jann Horn Cc: Liam Howlett Cc: Mark Brown Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/merge.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index bbae66fc5038..cc26480098ae 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -470,7 +470,9 @@ TEST_F(merge, handle_uprobe_upon_merged_vma) ASSERT_GE(fd, 0); ASSERT_EQ(ftruncate(fd, page_size), 0); - ASSERT_EQ(read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type), 0); + if (read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) != 0) { + SKIP(goto out, "Failed to read uprobe sysfs file, skipping"); + } memset(&attr, 0, attr_sz); attr.size = attr_sz; @@ -491,6 +493,7 @@ TEST_F(merge, handle_uprobe_upon_merged_vma) ASSERT_NE(mremap(ptr2, page_size, page_size, MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED); +out: close(fd); remove(probe_file); } From 38103247777695ca2b09691b2247e66f157908c6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 16 Jun 2025 21:16:43 +0100 Subject: [PATCH 112/297] MAINTAINERS: add missing mm/workingset.c file to mm reclaim section The working set logic belongs very much to the reclaim section and is otherwise not assigned to any other MAINTAINERS section so add it here. Link: https://lkml.kernel.org/r/20250616201643.561626-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Shakeel Butt Acked-by: Qi Zheng Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9a7fdc52cf9d..0497cbcc5d3e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15871,6 +15871,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/pt_reclaim.c F: mm/vmscan.c +F: mm/workingset.c MEMORY MANAGEMENT - RMAP (REVERSE MAPPING) M: Andrew Morton From 40ffd2887635c492b758d8b02172c97f5d42f593 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 16 Jun 2025 21:08:44 +0100 Subject: [PATCH 113/297] MAINTAINERS: add missing test files to mm gup section We previously overlooked GUP test files that sensibly should belong to the GUP section, include them now. Link: https://lkml.kernel.org/r/20250616200844.560225-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Jason Gunthorpe Cc: Peter Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0497cbcc5d3e..a70454f527c3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15795,6 +15795,10 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: mm/gup.c +F: mm/gup_test.c +F: mm/gup_test.h +F: tools/testing/selftests/mm/gup_longterm.c +F: tools/testing/selftests/mm/gup_test.c MEMORY MANAGEMENT - KSM (Kernel Samepage Merging) M: Andrew Morton From fba46a5d83ca8decb338722fb4899026d8d9ead2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 16 Jun 2025 14:45:20 -0400 Subject: [PATCH 114/297] maple_tree: fix MA_STATE_PREALLOC flag in mas_preallocate() Temporarily clear the preallocation flag when explicitly requesting allocations. Pre-existing allocations are already counted against the request through mas_node_count_gfp(), but the allocations will not happen if the MA_STATE_PREALLOC flag is set. This flag is meant to avoid re-allocating in bulk allocation mode, and to detect issues with preallocation calculations. The MA_STATE_PREALLOC flag should also always be set on zero allocations so that detection of underflow allocations will print a WARN_ON() during consumption. User visible effect of this flaw is a WARN_ON() followed by a null pointer dereference when subsequent requests for larger number of nodes is ignored, such as the vma merge retry in mmap_region() caused by drivers altering the vma flags (which happens in v6.6, at least) Link: https://lkml.kernel.org/r/20250616184521.3382795-3-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: Zhaoyang Huang Reported-by: Hailong Liu Link: https://lore.kernel.org/all/1652f7eb-a51b-4fee-8058-c73af63bacd1@oppo.com/ Link: https://lore.kernel.org/all/20250428184058.1416274-1-Liam.Howlett@oracle.com/ Link: https://lore.kernel.org/all/20250429014754.1479118-1-Liam.Howlett@oracle.com/ Cc: Lorenzo Stoakes Cc: Suren Baghdasaryan Cc: Hailong Liu Cc: zhangpeng.00@bytedance.com Cc: Steve Kang Cc: Matthew Wilcox Cc: Sidhartha Kumar Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index affe979bd14d..00524e55a21e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5527,8 +5527,9 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(&wr_mas, entry); if (!request) - return ret; + goto set_flag; + mas->mas_flags &= ~MA_STATE_PREALLOC; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); @@ -5538,6 +5539,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) return ret; } +set_flag: mas->mas_flags |= MA_STATE_PREALLOC; return ret; } From 883cf5b0b8389610f17106c454180c7f54b8d486 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 17 Jun 2025 20:59:10 +0200 Subject: [PATCH 115/297] MAINTAINERS: update maintainers for HugeTLB Change my role to Maintainer as I am quite involved in HugeTLB development, and will be more so with the upcoming HugetLB-pagewalk unification, so I would like to help Munchun take care of the code. Besides, having two people will help in offloading some pressure. Also add David as a Reviewer since he has quite some knowledge in the field and has already provided valuable feedback. Link: https://lkml.kernel.org/r/20250617185910.471406-1-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Muchun Song Cc: Michal Hocko Cc: Peter Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a70454f527c3..a2c8e4546cf8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11157,7 +11157,8 @@ F: include/linux/platform_data/huawei-gaokun-ec.h HUGETLB SUBSYSTEM M: Muchun Song -R: Oscar Salvador +M: Oscar Salvador +R: David Hildenbrand L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages From b6d19f3742ff3429d8eb2cdf51a8ab598c197ee5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 18:45:38 +0100 Subject: [PATCH 116/297] MAINTAINERS: add further init files to mm init block These files comprise the bootmem info logic which is initialised on startup and also memory tests that are run on startup and as such this seems the most appropriate section for them. Link: https://lkml.kernel.org/r/20250617174538.188977-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a2c8e4546cf8..1f87cb444c83 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15680,8 +15680,11 @@ S: Maintained F: Documentation/core-api/boot-time-mm.rst F: Documentation/core-api/kho/bindings/memblock/* F: include/linux/memblock.h +F: mm/bootmem_info.c F: mm/memblock.c +F: mm/memtest.c F: mm/mm_init.c +F: mm/rodata_test.c F: tools/testing/memblock/ MEMORY ALLOCATION PROFILING From d91b00b687abf6fef13be030abd848416f320149 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 18:15:38 +0100 Subject: [PATCH 117/297] MAINTAINERS: add hugetlb_cgroup.c to hugetlb section This file is clearly specific to hugetlb so this seems the most appropriate place for it. Link: https://lkml.kernel.org/r/20250617171538.178042-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1f87cb444c83..cf9ea0425011 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11169,6 +11169,7 @@ F: fs/hugetlbfs/ F: include/linux/hugetlb.h F: include/trace/events/hugetlbfs.h F: mm/hugetlb.c +F: mm/hugetlb_cgroup.c F: mm/hugetlb_cma.c F: mm/hugetlb_cma.h F: mm/hugetlb_vmemmap.c From a1540dcbe0246c42ee271a3b7bba5ec7c933321a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 17:51:42 +0100 Subject: [PATCH 118/297] MAINTAINERS: add stray rmap file to mm rmap section page_vma_mapped_walk() is used to traverse page tables from a VMA, used by rmap logic once the reverse mapping has been traversed to the VMA level. It is also used by other users (migration, damon, etc.) but is primarily used by the reverse mapping and is a key part of its logic, so it seems appropriate to place it here. Link: https://lkml.kernel.org/r/20250617165142.173716-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index cf9ea0425011..49fcc5d02df7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15893,6 +15893,7 @@ R: Harry Yoo L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h +F: mm/page_vma_mapped.c F: mm/rmap.c MEMORY MANAGEMENT - SECRETMEM From db5921ab8aa23142b277325192a7443601dc4534 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 17:13:59 +0100 Subject: [PATCH 119/297] MAINTAINERS: add memfd, shmem quota files to shmem section These files seem best suited to shmem. Link: https://lkml.kernel.org/r/20250617161359.166955-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 49fcc5d02df7..0eec6e1d4694 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15740,7 +15740,6 @@ F: Documentation/admin-guide/mm/ F: Documentation/mm/ F: include/linux/gfp.h F: include/linux/gfp_types.h -F: include/linux/memfd.h F: include/linux/memory_hotplug.h F: include/linux/memory-tiers.h F: include/linux/mempolicy.h @@ -25042,8 +25041,11 @@ M: Hugh Dickins R: Baolin Wang L: linux-mm@kvack.org S: Maintained +F: include/linux/memfd.h F: include/linux/shmem_fs.h +F: mm/memfd.c F: mm/shmem.c +F: mm/shmem_quota.c TOMOYO SECURITY MODULE M: Kentaro Takeda From c742d127d2d831aa83ae2987a508bca2bf0c7736 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 15:41:30 +0100 Subject: [PATCH 120/297] MAINTAINERS: add additional mmap-related files to mmap section msync and nommu are directly related to memory mapping, mincore is less so but all are roughly speaking operating on virtual memory mappings from the point of view of the user so this seems the most appropriate place for them. Link: https://lkml.kernel.org/r/20250617144130.147847-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Acked-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Jann Horn Signed-off-by: Andrew Morton --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0eec6e1d4694..ca8a70f7b047 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15985,11 +15985,14 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/trace/events/mmap.h +F: mm/mincore.c F: mm/mlock.c F: mm/mmap.c F: mm/mprotect.c F: mm/mremap.c F: mm/mseal.c +F: mm/msync.c +F: mm/nommu.c F: mm/vma.c F: mm/vma.h F: mm/vma_exec.c From 4540e41e753a7d69ecd3f5bad51fe620205c3a18 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Sun, 15 Jun 2025 23:59:41 +0100 Subject: [PATCH 121/297] HID: appletb-kbd: fix "appletb_backlight" backlight device reference counting During appletb_kbd_probe, probe attempts to get the backlight device by name. When this happens backlight_device_get_by_name looks for a device in the backlight class which has name "appletb_backlight" and upon finding a match it increments the reference count for the device and returns it to the caller. However this reference is never released leading to a reference leak. Fix this by decrementing the backlight device reference count on removal via put_device and on probe failure. Fixes: 93a0fc489481 ("HID: hid-appletb-kbd: add support for automatic brightness control while using the touchbar") Cc: stable@vger.kernel.org Signed-off-by: Qasim Ijaz Reviewed-by: Aditya Garg Signed-off-by: Jiri Kosina --- drivers/hid/hid-appletb-kbd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/hid-appletb-kbd.c b/drivers/hid/hid-appletb-kbd.c index ef51b2c06872..e06567886e50 100644 --- a/drivers/hid/hid-appletb-kbd.c +++ b/drivers/hid/hid-appletb-kbd.c @@ -438,6 +438,8 @@ static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id return 0; close_hw: + if (kbd->backlight_dev) + put_device(&kbd->backlight_dev->dev); hid_hw_close(hdev); stop_hw: hid_hw_stop(hdev); @@ -453,6 +455,9 @@ static void appletb_kbd_remove(struct hid_device *hdev) input_unregister_handler(&kbd->inp_handler); timer_delete_sync(&kbd->inactivity_timer); + if (kbd->backlight_dev) + put_device(&kbd->backlight_dev->dev); + hid_hw_close(hdev); hid_hw_stop(hdev); } From a8905238c3bbe13db90065ed74682418f23830c3 Mon Sep 17 00:00:00 2001 From: Akira Inoue Date: Thu, 12 Jun 2025 13:34:38 +0900 Subject: [PATCH 122/297] HID: lenovo: Add support for ThinkPad X1 Tablet Thin Keyboard Gen2 Add "Thinkpad X1 Tablet Gen 2 Keyboard" PID to hid-lenovo driver to fix trackpoint not working issue. Signed-off-by: Akira Inoue Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-lenovo.c | 8 ++++++++ drivers/hid/hid-multitouch.c | 8 +++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 2d3769405ec3..c6468568aea1 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -821,6 +821,7 @@ #define USB_DEVICE_ID_LENOVO_TPPRODOCK 0x6067 #define USB_DEVICE_ID_LENOVO_X1_COVER 0x6085 #define USB_DEVICE_ID_LENOVO_X1_TAB 0x60a3 +#define USB_DEVICE_ID_LENOVO_X1_TAB2 0x60a4 #define USB_DEVICE_ID_LENOVO_X1_TAB3 0x60b5 #define USB_DEVICE_ID_LENOVO_X12_TAB 0x60fe #define USB_DEVICE_ID_LENOVO_X12_TAB2 0x61ae diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index a3c23a72316a..b3121fa7a72d 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -492,6 +492,7 @@ static int lenovo_input_mapping(struct hid_device *hdev, case USB_DEVICE_ID_LENOVO_X12_TAB: case USB_DEVICE_ID_LENOVO_X12_TAB2: case USB_DEVICE_ID_LENOVO_X1_TAB: + case USB_DEVICE_ID_LENOVO_X1_TAB2: case USB_DEVICE_ID_LENOVO_X1_TAB3: return lenovo_input_mapping_x1_tab_kbd(hdev, hi, field, usage, bit, max); default: @@ -608,6 +609,7 @@ static ssize_t attr_fn_lock_store(struct device *dev, case USB_DEVICE_ID_LENOVO_X12_TAB2: case USB_DEVICE_ID_LENOVO_TP10UBKBD: case USB_DEVICE_ID_LENOVO_X1_TAB: + case USB_DEVICE_ID_LENOVO_X1_TAB2: case USB_DEVICE_ID_LENOVO_X1_TAB3: ret = lenovo_led_set_tp10ubkbd(hdev, TP10UBKBD_FN_LOCK_LED, value); if (ret) @@ -864,6 +866,7 @@ static int lenovo_event(struct hid_device *hdev, struct hid_field *field, case USB_DEVICE_ID_LENOVO_X12_TAB2: case USB_DEVICE_ID_LENOVO_TP10UBKBD: case USB_DEVICE_ID_LENOVO_X1_TAB: + case USB_DEVICE_ID_LENOVO_X1_TAB2: case USB_DEVICE_ID_LENOVO_X1_TAB3: return lenovo_event_tp10ubkbd(hdev, field, usage, value); default: @@ -1147,6 +1150,7 @@ static int lenovo_led_brightness_set(struct led_classdev *led_cdev, case USB_DEVICE_ID_LENOVO_X12_TAB2: case USB_DEVICE_ID_LENOVO_TP10UBKBD: case USB_DEVICE_ID_LENOVO_X1_TAB: + case USB_DEVICE_ID_LENOVO_X1_TAB2: case USB_DEVICE_ID_LENOVO_X1_TAB3: ret = lenovo_led_set_tp10ubkbd(hdev, tp10ubkbd_led[led_nr], value); break; @@ -1387,6 +1391,7 @@ static int lenovo_probe(struct hid_device *hdev, case USB_DEVICE_ID_LENOVO_X12_TAB2: case USB_DEVICE_ID_LENOVO_TP10UBKBD: case USB_DEVICE_ID_LENOVO_X1_TAB: + case USB_DEVICE_ID_LENOVO_X1_TAB2: case USB_DEVICE_ID_LENOVO_X1_TAB3: ret = lenovo_probe_tp10ubkbd(hdev); break; @@ -1476,6 +1481,7 @@ static void lenovo_remove(struct hid_device *hdev) case USB_DEVICE_ID_LENOVO_X12_TAB2: case USB_DEVICE_ID_LENOVO_TP10UBKBD: case USB_DEVICE_ID_LENOVO_X1_TAB: + case USB_DEVICE_ID_LENOVO_X1_TAB2: case USB_DEVICE_ID_LENOVO_X1_TAB3: lenovo_remove_tp10ubkbd(hdev); break; @@ -1526,6 +1532,8 @@ static const struct hid_device_id lenovo_devices[] = { */ { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X1_TAB) }, + { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, + USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X1_TAB2) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X1_TAB3) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index ded0fef7d8c7..24aa6e7e6fdd 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -2132,12 +2132,18 @@ static const struct hid_device_id mt_devices[] = { HID_DEVICE(BUS_I2C, HID_GROUP_GENERIC, USB_VENDOR_ID_LG, I2C_DEVICE_ID_LG_7010) }, - /* Lenovo X1 TAB Gen 2 */ + /* Lenovo X1 TAB Gen 1 */ { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT, HID_DEVICE(BUS_USB, HID_GROUP_MULTITOUCH_WIN_8, USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X1_TAB) }, + /* Lenovo X1 TAB Gen 2 */ + { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT, + HID_DEVICE(BUS_USB, HID_GROUP_MULTITOUCH_WIN_8, + USB_VENDOR_ID_LENOVO, + USB_DEVICE_ID_LENOVO_X1_TAB2) }, + /* Lenovo X1 TAB Gen 3 */ { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT, HID_DEVICE(BUS_USB, HID_GROUP_MULTITOUCH_WIN_8, From 89a33de314945c866b155d369f224fa552af1722 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Tue, 10 Jun 2025 19:30:37 +0530 Subject: [PATCH 123/297] Bluetooth: btintel_pcie: Fix potential race condition in firmware download During firmware download, if an error occurs, interrupts must be disabled, synchronized, and re-enabled before retrying the download. This change ensures proper interrupt handling to prevent race conditions. Signed-off-by: Chandrashekar Devegowda Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 33 ++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 563165c5efae..e1c688dd2d45 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -2033,6 +2033,28 @@ static void btintel_pcie_release_hdev(struct btintel_pcie_data *data) data->hdev = NULL; } +static void btintel_pcie_disable_interrupts(struct btintel_pcie_data *data) +{ + spin_lock(&data->irq_lock); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_FH_INT_MASK, data->fh_init_mask); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_HW_INT_MASK, data->hw_init_mask); + spin_unlock(&data->irq_lock); +} + +static void btintel_pcie_enable_interrupts(struct btintel_pcie_data *data) +{ + spin_lock(&data->irq_lock); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_FH_INT_MASK, ~data->fh_init_mask); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_HW_INT_MASK, ~data->hw_init_mask); + spin_unlock(&data->irq_lock); +} + +static void btintel_pcie_synchronize_irqs(struct btintel_pcie_data *data) +{ + for (int i = 0; i < data->alloc_vecs; i++) + synchronize_irq(data->msix_entries[i].vector); +} + static int btintel_pcie_setup_internal(struct hci_dev *hdev) { struct btintel_pcie_data *data = hci_get_drvdata(hdev); @@ -2152,6 +2174,8 @@ static int btintel_pcie_setup(struct hci_dev *hdev) bt_dev_err(hdev, "Firmware download retry count: %d", fw_dl_retry); btintel_pcie_dump_debug_registers(hdev); + btintel_pcie_disable_interrupts(data); + btintel_pcie_synchronize_irqs(data); err = btintel_pcie_reset_bt(data); if (err) { bt_dev_err(hdev, "Failed to do shr reset: %d", err); @@ -2159,6 +2183,7 @@ static int btintel_pcie_setup(struct hci_dev *hdev) } usleep_range(10000, 12000); btintel_pcie_reset_ia(data); + btintel_pcie_enable_interrupts(data); btintel_pcie_config_msix(data); err = btintel_pcie_enable_bt(data); if (err) { @@ -2291,6 +2316,12 @@ static void btintel_pcie_remove(struct pci_dev *pdev) data = pci_get_drvdata(pdev); + btintel_pcie_disable_interrupts(data); + + btintel_pcie_synchronize_irqs(data); + + flush_work(&data->rx_work); + btintel_pcie_reset_bt(data); for (int i = 0; i < data->alloc_vecs; i++) { struct msix_entry *msix_entry; @@ -2303,8 +2334,6 @@ static void btintel_pcie_remove(struct pci_dev *pdev) btintel_pcie_release_hdev(data); - flush_work(&data->rx_work); - destroy_workqueue(data->workqueue); btintel_pcie_free(data); From 042bb9603c44620dce98717a2d23235ca57a00d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Danis?= Date: Thu, 12 Jun 2025 09:50:34 +0200 Subject: [PATCH 124/297] Bluetooth: L2CAP: Fix L2CAP MTU negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OBEX download from iPhone is currently slow due to small packet size used to transfer data which doesn't follow the MTU negotiated during L2CAP connection, i.e. 672 bytes instead of 32767: < ACL Data TX: Handle 11 flags 0x00 dlen 12 L2CAP: Connection Request (0x02) ident 18 len 4 PSM: 4103 (0x1007) Source CID: 72 > ACL Data RX: Handle 11 flags 0x02 dlen 16 L2CAP: Connection Response (0x03) ident 18 len 8 Destination CID: 14608 Source CID: 72 Result: Connection successful (0x0000) Status: No further information available (0x0000) < ACL Data TX: Handle 11 flags 0x00 dlen 27 L2CAP: Configure Request (0x04) ident 20 len 19 Destination CID: 14608 Flags: 0x0000 Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 26 L2CAP: Configure Request (0x04) ident 72 len 18 Destination CID: 72 Flags: 0x0000 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 0 Monitor timeout: 0 Maximum PDU size: 65527 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) < ACL Data TX: Handle 11 flags 0x00 dlen 29 L2CAP: Configure Response (0x05) ident 72 len 21 Source CID: 14608 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 672 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 32 L2CAP: Configure Response (0x05) ident 20 len 24 Source CID: 72 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) ... > ACL Data RX: Handle 11 flags 0x02 dlen 680 Channel: 72 len 676 ctrl 0x0202 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Unsegmented TxSeq 1 ReqSeq 2 < ACL Data TX: Handle 11 flags 0x00 dlen 13 Channel: 14608 len 9 ctrl 0x0204 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Unsegmented TxSeq 2 ReqSeq 2 > ACL Data RX: Handle 11 flags 0x02 dlen 680 Channel: 72 len 676 ctrl 0x0304 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Unsegmented TxSeq 2 ReqSeq 3 The MTUs are negotiated for each direction. In this traces 32767 for iPhone->localhost and no MTU for localhost->iPhone, which based on '4.4 L2CAP_CONFIGURATION_REQ' (Core specification v5.4, Vol. 3, Part A): The only parameters that should be included in the L2CAP_CONFIGURATION_REQ packet are those that require different values than the default or previously agreed values. ... Any missing configuration parameters are assumed to have their most recently explicitly or implicitly accepted values. and '5.1 Maximum transmission unit (MTU)': If the remote device sends a positive L2CAP_CONFIGURATION_RSP packet it should include the actual MTU to be used on this channel for traffic flowing into the local device. ... The default value is 672 octets. is set by BlueZ to 672 bytes. It seems that the iPhone used the lowest negotiated value to transfer data to the localhost instead of the negotiated one for the incoming direction. This could be fixed by using the MTU negotiated for the other direction, if exists, in the L2CAP_CONFIGURATION_RSP. This allows to use segmented packets as in the following traces: < ACL Data TX: Handle 11 flags 0x00 dlen 12 L2CAP: Connection Request (0x02) ident 22 len 4 PSM: 4103 (0x1007) Source CID: 72 < ACL Data TX: Handle 11 flags 0x00 dlen 27 L2CAP: Configure Request (0x04) ident 24 len 19 Destination CID: 2832 Flags: 0x0000 Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 26 L2CAP: Configure Request (0x04) ident 15 len 18 Destination CID: 72 Flags: 0x0000 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 0 Monitor timeout: 0 Maximum PDU size: 65527 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) < ACL Data TX: Handle 11 flags 0x00 dlen 29 L2CAP: Configure Response (0x05) ident 15 len 21 Source CID: 2832 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 32 L2CAP: Configure Response (0x05) ident 24 len 24 Source CID: 72 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) ... > ACL Data RX: Handle 11 flags 0x02 dlen 1009 Channel: 72 len 1005 ctrl 0x4202 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Start (len 21884) TxSeq 1 ReqSeq 2 > ACL Data RX: Handle 11 flags 0x02 dlen 1009 Channel: 72 len 1005 ctrl 0xc204 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Continuation TxSeq 2 ReqSeq 2 This has been tested with kernel 5.4 and BlueZ 5.77. Cc: stable@vger.kernel.org Signed-off-by: Frédéric Danis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a5bde5db58ef..40daa38276f3 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3415,7 +3415,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; struct l2cap_conf_efs efs; u8 remote_efs = 0; - u16 mtu = L2CAP_DEFAULT_MTU; + u16 mtu = 0; u16 result = L2CAP_CONF_SUCCESS; u16 size; @@ -3520,6 +3520,13 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data /* Configure output options and let the other side know * which ones we don't like. */ + /* If MTU is not provided in configure request, use the most recently + * explicitly or implicitly accepted value for the other direction, + * or the default value. + */ + if (mtu == 0) + mtu = chan->imtu ? chan->imtu : L2CAP_DEFAULT_MTU; + if (mtu < L2CAP_DEFAULT_MIN_MTU) result = L2CAP_CONF_UNACCEPT; else { From db0ff7e15923ffa7067874604ca275e92343f1b1 Mon Sep 17 00:00:00 2001 From: Shuai Zhang Date: Mon, 9 Jun 2025 18:55:00 +0800 Subject: [PATCH 125/297] driver: bluetooth: hci_qca:fix unable to load the BT driver Some modules have BT_EN enabled via a hardware pull-up, meaning it is not defined in the DTS and is not controlled through the power sequence. In such cases, fall through to follow the legacy flow. Signed-off-by: Shuai Zhang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 5fe5879881f5..3ec0be496820 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2392,10 +2392,17 @@ static int qca_serdev_probe(struct serdev_device *serdev) */ qcadev->bt_power->pwrseq = devm_pwrseq_get(&serdev->dev, "bluetooth"); - if (IS_ERR(qcadev->bt_power->pwrseq)) - return PTR_ERR(qcadev->bt_power->pwrseq); - break; + /* + * Some modules have BT_EN enabled via a hardware pull-up, + * meaning it is not defined in the DTS and is not controlled + * through the power sequence. In such cases, fall through + * to follow the legacy flow. + */ + if (IS_ERR(qcadev->bt_power->pwrseq)) + qcadev->bt_power->pwrseq = NULL; + else + break; } fallthrough; case QCA_WCN3950: From 850f0e2433cdd38f36d80a4c1ab59f82029bef74 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Wed, 18 Jun 2025 20:54:57 -0700 Subject: [PATCH 126/297] MAINTAINERS: Update Drew Fustini's email address Switch from personal domain to kernel.org address. Signed-off-by: Drew Fustini Link: https://lore.kernel.org/r/20250619035457.331065-1-fustini@kernel.org Signed-off-by: Palmer Dabbelt --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index c8a21d72241f..1ce0dd52dfaf 100644 --- a/.mailmap +++ b/.mailmap @@ -222,6 +222,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> Dmitry Safonov <0x7f454c46@gmail.com> Domen Puncer Douglas Gilbert +Drew Fustini Ed L. Cashin Elliot Berman Enric Balletbo i Serra diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..dcb1142d10b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21374,7 +21374,7 @@ N: spacemit K: spacemit RISC-V THEAD SoC SUPPORT -M: Drew Fustini +M: Drew Fustini M: Guo Ren M: Fu Wei L: linux-riscv@lists.infradead.org From 64f7548aad63d2fbca2eeb6eb33361c218ebd5a5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 21:19:40 +0200 Subject: [PATCH 127/297] lib/crypto: sha256: Mark sha256_choose_blocks as __always_inline When the compiler chooses to not inline sha256_choose_blocks() in the purgatory code, it fails to link against the missing CPU specific version: x86_64-linux-ld: arch/x86/purgatory/purgatory.ro: in function `sha256_choose_blocks.part.0': sha256.c:(.text+0x6a6): undefined reference to `irq_fpu_usable' sha256.c:(.text+0x6c7): undefined reference to `sha256_blocks_arch' sha256.c:(.text+0x6cc): undefined reference to `sha256_blocks_simd' Mark this function as __always_inline to prevent this, same as sha256_finup(). Fixes: 5b90a779bc54 ("crypto: lib/sha256 - Add helpers for block-based shash") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250620191952.1867578-1-arnd@kernel.org Signed-off-by: Eric Biggers --- include/crypto/internal/sha2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h index b9bccd3ff57f..21a27fd5e198 100644 --- a/include/crypto/internal/sha2.h +++ b/include/crypto/internal/sha2.h @@ -25,7 +25,7 @@ void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); -static inline void sha256_choose_blocks( +static __always_inline void sha256_choose_blocks( u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks, bool force_generic, bool force_simd) { From 0b39b055b5b48cbbdf5746a1ca6e3f6b0221e537 Mon Sep 17 00:00:00 2001 From: Xiaowei Li Date: Fri, 20 Jun 2025 10:27:02 +0800 Subject: [PATCH 128/297] net: usb: qmi_wwan: add SIMCom 8230C composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for SIMCom 8230C which is based on Qualcomm SDX35 chip. 0x9071: tty (DM) + tty (NMEA) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=05 Cnt=02 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1e0e ProdID=9071 Rev= 5.15 S: Manufacturer=SIMCOM S: Product=SDXBAAGHA-IDP _SN:D744C4C5 S: SerialNumber=0123456789ABCDEF C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=none E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Xiaowei Li Acked-by: Bjørn Mork Link: https://patch.msgid.link/tencent_21D781FAA4969FEACA6ABB460362B52C9409@qq.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index b586b1c13a47..f5647ee0adde 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1426,6 +1426,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x22de, 0x9051, 2)}, /* Hucom Wireless HM-211S/K */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_QUIRK_SET_DTR(0x1e0e, 0x9001, 5)}, /* SIMCom 7100E, 7230E, 7600E ++ */ + {QMI_QUIRK_SET_DTR(0x1e0e, 0x9071, 3)}, /* SIMCom 8230C ++ */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0191, 4)}, /* Quectel EG91 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ From 714db279942b1fb9b97c4243e186825a96750239 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 19 Jun 2025 14:16:07 -0700 Subject: [PATCH 129/297] CREDITS: Add entry for Shannon Nelson I'm retiring and have already had my name removed from MAINTAINERS. A couple of folks kindly suggested I should have an entry here. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250619211607.1244217-1-sln@onemain.com Signed-off-by: Jakub Kicinski --- CREDITS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CREDITS b/CREDITS index 45446ae322ec..c30b75f9a732 100644 --- a/CREDITS +++ b/CREDITS @@ -2981,6 +2981,11 @@ S: 521 Pleasant Valley Road S: Potsdam, New York 13676 S: USA +N: Shannon Nelson +E: sln@onemain.com +D: Worked on several network drivers including +D: ixgbe, i40e, ionic, pds_core, pds_vdpa, pds_fwctl + N: Dave Neuer E: dave.neuer@pobox.com D: Helped implement support for Compaq's H31xx series iPAQs From bb378314ceee4d181e26bfe180deca852ae80c5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Jun 2025 16:51:32 -0400 Subject: [PATCH 130/297] bcachefs: Add missing bch2_err_class() to fileattr_set() Make sure we return a standard error code. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3063a8ddc2df..db24a76563f8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1732,7 +1732,8 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - return ret; + + return bch2_err_class(ret); } static const struct file_operations bch_file_operations = { From abcb6bd4be19d935795611c022d7d19ab69363b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Jun 2025 17:06:43 -0400 Subject: [PATCH 131/297] bcachefs: fix spurious error_throw Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e76809e71858..77d93beb3c8f 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -353,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); - if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node))) + if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); From 72c0d9cb0fc48b6d382c3b5b6f702108a612cfdb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 19 Jun 2025 23:06:01 -0400 Subject: [PATCH 132/297] bcachefs: Fix range in bch2_lookup_indirect_extent() error path Before calling bch2_indirect_extent_missing_error(), we have to calculate the missing range, which is the intersection of the reflink pointer and the non-indirect-extent we found. The calculation didn't take into account that the returned extent may span the iter position, leading to an infinite loop when we (unnecessarily) resized the extent we were returning to one that didn't extend past the offset we were looking up. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index a535abd44df3..92b90cfe622b 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -64,6 +64,9 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); + + if (REFLINK_P_ERROR(p.v)) + prt_str(out, " error"); } bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) @@ -269,13 +272,12 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, return k; if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - unsigned size = min((u64) k.k->size, - REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - - reflink_offset); - bch2_key_resize(&iter->k, size); + u64 missing_end = min(k.k->p.offset, + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); + BUG_ON(reflink_offset == missing_end); int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, - k.k->p.offset, should_commit); + missing_end, should_commit); if (ret) { bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); From e41687b511d5e5437db5d2151e23c115dba30411 Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Fri, 20 Jun 2025 14:43:29 -0600 Subject: [PATCH 133/297] ALSA: hda/realtek: Add quirks for some Clevo laptops Add audio quirks to fix speaker output and headset detection on the following Clevo models: - V350ENC - V350WNPQ - V540TU - X560WNR - X580WNS Signed-off-by: Tim Crawford Link: https://patch.msgid.link/20250620204329.35878-1-tcrawford@system76.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 2e1618494c20..2149dcd34bc6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2656,6 +2656,7 @@ static const struct hda_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x147b, 0x107a, "Abit AW9D-MAX", ALC882_FIXUP_ABIT_AW9D_MAX), SND_PCI_QUIRK(0x1558, 0x3702, "Clevo X370SN[VW]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x50d3, "Clevo PC50[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x5802, "Clevo X58[05]WN[RST]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65d1, "Clevo PB51[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65d2, "Clevo PB51R[CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65e1, "Clevo PB51[ED][DF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), @@ -11135,6 +11136,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x14a1, "Clevo L141MU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x2624, "Clevo L240TU", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x28c1, "Clevo V370VND", ALC2XX_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1558, 0x35a1, "Clevo V3[56]0EN[CDE]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x35b1, "Clevo V3[57]0WN[MNP]Q", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x4018, "Clevo NV40M[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x4019, "Clevo NV40MZ", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x4020, "Clevo NV40MB", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), @@ -11162,6 +11165,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x51b1, "Clevo NS50AU", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x51b3, "Clevo NS70AU", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x5630, "Clevo NP50RNJS", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x5700, "Clevo X560WN[RST]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70a1, "Clevo NB70T[HJK]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70b3, "Clevo NK70SB", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70f2, "Clevo NH79EPY", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), @@ -11201,6 +11205,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0xa650, "Clevo NP[567]0SN[CD]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xa671, "Clevo NP70SN[CDE]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xa741, "Clevo V54x_6x_TNE", ALC245_FIXUP_CLEVO_NOISY_MIC), + SND_PCI_QUIRK(0x1558, 0xa743, "Clevo V54x_6x_TU", ALC245_FIXUP_CLEVO_NOISY_MIC), SND_PCI_QUIRK(0x1558, 0xa763, "Clevo V54x_6x_TU", ALC245_FIXUP_CLEVO_NOISY_MIC), SND_PCI_QUIRK(0x1558, 0xb018, "Clevo NP50D[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xb019, "Clevo NH77D[BE]Q", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), From 68cc9d3c8e44afe90e43cbbd2960da15c2f31e23 Mon Sep 17 00:00:00 2001 From: Yasmin Fitzgerald Date: Sat, 21 Jun 2025 01:36:14 -0400 Subject: [PATCH 134/297] ALSA: hda/realtek - Enable mute LED on HP Pavilion Laptop 15-eg100 The HP Pavilion Laptop 15-eg100 has Realtek HDA codec ALC287. It needs the ALC287_FIXUP_HP_GPIO_LED quirk to enable the mute LED. Signed-off-by: Yasmin Fitzgerald Link: https://patch.msgid.link/20250621053832.52950-1-sunoflife1.git@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 2149dcd34bc6..e144ebb08841 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10738,6 +10738,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8975, "HP EliteBook x360 840 Aero G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x897d, "HP mt440 Mobile Thin Client U74", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8981, "HP Elite Dragonfly G3", ALC245_FIXUP_CS35L41_SPI_4), + SND_PCI_QUIRK(0x103c, 0x898a, "HP Pavilion 15-eg100", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x898e, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x898f, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8991, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), From 302251f1fdfd302ce99a619aac1a5164d0bb7c4b Mon Sep 17 00:00:00 2001 From: Faisal Bukhari Date: Sat, 21 Jun 2025 16:02:04 +0530 Subject: [PATCH 135/297] Fix typo in marvell octeontx2 documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst Fixes a spelling mistake: "funcionality" → "functionality". Signed-off-by: Faisal Bukhari Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../networking/device_drivers/ethernet/marvell/octeontx2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst index af7db0e91f6b..a52850602cd8 100644 --- a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst +++ b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst @@ -66,7 +66,7 @@ Admin Function driver As mentioned above RVU PF0 is called the admin function (AF), this driver supports resource provisioning and configuration of functional blocks. Doesn't handle any I/O. It sets up few basic stuff but most of the -funcionality is achieved via configuration requests from PFs and VFs. +functionality is achieved via configuration requests from PFs and VFs. PF/VFs communicates with AF via a shared memory region (mailbox). Upon receiving requests AF does resource provisioning and other HW configuration. From b993ea46b3b601915ceaaf3c802adf11e7d6bac6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 2025 14:28:44 +0000 Subject: [PATCH 136/297] atm: clip: prevent NULL deref in clip_push() Blamed commit missed that vcc_destroy_socket() calls clip_push() with a NULL skb. If clip_devs is NULL, clip_push() then crashes when reading skb->truesize. Fixes: 93a2014afbac ("atm: fix a UAF in lec_arp_clear_vccs()") Reported-by: syzbot+1316233c4c6803382a8b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68556f59.a00a0220.137b3.004e.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Gengming Liu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/atm/clip.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/atm/clip.c b/net/atm/clip.c index 61b5b700817d..b234dc3bcb0d 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -193,12 +193,6 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("\n"); - if (!clip_devs) { - atm_return(vcc, skb->truesize); - kfree_skb(skb); - return; - } - if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) @@ -208,6 +202,11 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) return; } atm_return(vcc, skb->truesize); + if (!clip_devs) { + kfree_skb(skb); + return; + } + skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; /* clip_vcc->entry == NULL if we don't have an IP address yet */ if (!skb->dev) { From 999fb9d51f939ee23cbb9313ae558d29d6987804 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Tue, 17 Jun 2025 14:20:12 +0200 Subject: [PATCH 137/297] ASoC: qcom: sm8250: Fix possibly undefined reference With CONFIG_SND_SOC_SM8250=y and CONFIG_SND_SOC_QCOM_OFFLOAD_UTILS=m selected in kconfig, the build will fail due to trying to link against a symbol only found in the module. aarch64-linux-gnu-ld: sound/soc/qcom/sm8250.o: in function `sm8250_snd_exit': sound/soc/qcom/sm8250.c:52:(.text+0x210): undefined reference to `qcom_snd_usb_offload_jack_remove' Fix this by declaring the dependency that forces CONFIG_SND_SOC_SM8250=m when CONFIG_SND_SOC_QCOM_OFFLOAD_UTILS is =m. Reported-by: Matthew Croughan Fixes: 1b8d0d87b934 ("ASoC: qcom: qdsp6: Add headphone jack for offload connection status") Signed-off-by: Luca Weiss Link: https://patch.msgid.link/20250617-snd-sm8250-dep-fix-v1-1-879af8906ec4@fairphone.com Signed-off-by: Mark Brown --- sound/soc/qcom/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/qcom/Kconfig b/sound/soc/qcom/Kconfig index e86b4a03dd61..3d9ba13ee1e5 100644 --- a/sound/soc/qcom/Kconfig +++ b/sound/soc/qcom/Kconfig @@ -186,6 +186,7 @@ config SND_SOC_SM8250 tristate "SoC Machine driver for SM8250 boards" depends on QCOM_APR && SOUNDWIRE depends on COMMON_CLK + depends on SND_SOC_QCOM_OFFLOAD_UTILS || !SND_SOC_QCOM_OFFLOAD_UTILS select SND_SOC_QDSP6 select SND_SOC_QCOM_COMMON select SND_SOC_QCOM_SDW From 7186b81807b4a08f8bf834b6bdc72d6ed8ba1587 Mon Sep 17 00:00:00 2001 From: Yuzuru10 Date: Sun, 22 Jun 2025 22:58:00 +0000 Subject: [PATCH 138/297] ASoC: amd: yc: add quirk for Acer Nitro ANV15-41 internal mic This patch adds DMI-based quirk for the Acer Nitro ANV15-41, allowing the internal microphone to be detected correctly on machines with "RB" as board vendor. Signed-off-by: Yuzuru Link: https://patch.msgid.link/20250622225754.20856-1-yuzuru_10@proton.me Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 98022e5fd428..499f9f7c76ee 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -353,6 +353,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "83Q3"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "RB"), + DMI_MATCH(DMI_PRODUCT_NAME, "Nitro ANV15-41"), + } + }, { .driver_data = &acp6x_card, .matches = { From bf39286adc5e10ce3e32eb86ad316ae56f3b52a0 Mon Sep 17 00:00:00 2001 From: Oliver Schramm Date: Sun, 22 Jun 2025 00:30:01 +0200 Subject: [PATCH 139/297] ASoC: amd: yc: Add DMI quirk for Lenovo IdeaPad Slim 5 15 It's smaller brother has already received the patch to enable the microphone, now add it too to the DMI quirk table. Cc: stable@vger.kernel.org Signed-off-by: Oliver Schramm Link: https://patch.msgid.link/20250621223000.11817-2-oliver.schramm97@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 499f9f7c76ee..97e340140d0c 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -367,6 +367,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "83J2"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "83J3"), + } + }, { .driver_data = &acp6x_card, .matches = { From fb721b2c35b1829b8ecf62e3adb41cf30260316a Mon Sep 17 00:00:00 2001 From: Louis Chauvet Date: Tue, 29 Apr 2025 10:36:23 +0200 Subject: [PATCH 140/297] drm: writeback: Fix drm_writeback_connector_cleanup signature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drm_writeback_connector_cleanup have the signature: static void drm_writeback_connector_cleanup( struct drm_device *dev, struct drm_writeback_connector *wb_connector) But it is stored and used as a drmres_release_t typedef void (*drmres_release_t)(struct drm_device *dev, void *res); While the current code is valid and does not produce any warning, the CFI runtime check (CONFIG_CFI_CLANG) can fail because the function signature is not the same as drmres_release_t. In order to fix this, change the function signature to match what is expected by drmres_release_t. Fixes: 1914ba2b91ea ("drm: writeback: Create drmm variants for drm_writeback_connector initialization") Suggested-by: Mark Yacoub Reviewed-by: Maíra Canal Link: https://lore.kernel.org/r/20250429-drm-fix-writeback-cleanup-v2-1-548ff3a4e284@bootlin.com Signed-off-by: Louis Chauvet --- drivers/gpu/drm/drm_writeback.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c index edbeab88ff2b..d983ee85cf13 100644 --- a/drivers/gpu/drm/drm_writeback.c +++ b/drivers/gpu/drm/drm_writeback.c @@ -343,17 +343,18 @@ EXPORT_SYMBOL(drm_writeback_connector_init_with_encoder); /** * drm_writeback_connector_cleanup - Cleanup the writeback connector * @dev: DRM device - * @wb_connector: Pointer to the writeback connector to clean up + * @data: Pointer to the writeback connector to clean up * * This will decrement the reference counter of blobs and destroy properties. It * will also clean the remaining jobs in this writeback connector. Caution: This helper will not * clean up the attached encoder and the drm_connector. */ static void drm_writeback_connector_cleanup(struct drm_device *dev, - struct drm_writeback_connector *wb_connector) + void *data) { unsigned long flags; struct drm_writeback_job *pos, *n; + struct drm_writeback_connector *wb_connector = data; delete_writeback_properties(dev); drm_property_blob_put(wb_connector->pixel_formats_blob_ptr); @@ -405,7 +406,7 @@ int drmm_writeback_connector_init(struct drm_device *dev, if (ret) return ret; - ret = drmm_add_action_or_reset(dev, (void *)drm_writeback_connector_cleanup, + ret = drmm_add_action_or_reset(dev, drm_writeback_connector_cleanup, wb_connector); if (ret) return ret; From 20d71750cc72e80859d52548cf5c2a7513983b0d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 20 Jun 2025 20:15:03 +0800 Subject: [PATCH 141/297] crypto: wp512 - Use API partial block handling Use the Crypto API partial block handling. Signed-off-by: Herbert Xu Tested-by: Milan Broz Signed-off-by: Herbert Xu --- crypto/wp512.c | 125 +++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/crypto/wp512.c b/crypto/wp512.c index 41f13d490333..229b189a7988 100644 --- a/crypto/wp512.c +++ b/crypto/wp512.c @@ -21,10 +21,10 @@ */ #include #include +#include #include -#include -#include -#include +#include +#include #define WP512_DIGEST_SIZE 64 #define WP384_DIGEST_SIZE 48 @@ -37,9 +37,6 @@ struct wp512_ctx { u8 bitLength[WP512_LENGTHBYTES]; - u8 buffer[WP512_BLOCK_SIZE]; - int bufferBits; - int bufferPos; u64 hash[WP512_DIGEST_SIZE/8]; }; @@ -779,16 +776,16 @@ static const u64 rc[WHIRLPOOL_ROUNDS] = { * The core Whirlpool transform. */ -static __no_kmsan_checks void wp512_process_buffer(struct wp512_ctx *wctx) { +static __no_kmsan_checks void wp512_process_buffer(struct wp512_ctx *wctx, + const u8 *buffer) { int i, r; u64 K[8]; /* the round key */ u64 block[8]; /* mu(buffer) */ u64 state[8]; /* the cipher state */ u64 L[8]; - const __be64 *buffer = (const __be64 *)wctx->buffer; for (i = 0; i < 8; i++) - block[i] = be64_to_cpu(buffer[i]); + block[i] = get_unaligned_be64(buffer + i * 8); state[0] = block[0] ^ (K[0] = wctx->hash[0]); state[1] = block[1] ^ (K[1] = wctx->hash[1]); @@ -991,8 +988,6 @@ static int wp512_init(struct shash_desc *desc) { int i; memset(wctx->bitLength, 0, 32); - wctx->bufferBits = wctx->bufferPos = 0; - wctx->buffer[0] = 0; for (i = 0; i < 8; i++) { wctx->hash[i] = 0L; } @@ -1000,84 +995,54 @@ static int wp512_init(struct shash_desc *desc) { return 0; } -static int wp512_update(struct shash_desc *desc, const u8 *source, - unsigned int len) +static void wp512_add_length(u8 *bitLength, u64 value) { - struct wp512_ctx *wctx = shash_desc_ctx(desc); - int sourcePos = 0; - unsigned int bits_len = len * 8; // convert to number of bits - int sourceGap = (8 - ((int)bits_len & 7)) & 7; - int bufferRem = wctx->bufferBits & 7; + u32 carry; int i; - u32 b, carry; - u8 *buffer = wctx->buffer; - u8 *bitLength = wctx->bitLength; - int bufferBits = wctx->bufferBits; - int bufferPos = wctx->bufferPos; - u64 value = bits_len; for (i = 31, carry = 0; i >= 0 && (carry != 0 || value != 0ULL); i--) { carry += bitLength[i] + ((u32)value & 0xff); bitLength[i] = (u8)carry; carry >>= 8; value >>= 8; } - while (bits_len > 8) { - b = ((source[sourcePos] << sourceGap) & 0xff) | - ((source[sourcePos + 1] & 0xff) >> (8 - sourceGap)); - buffer[bufferPos++] |= (u8)(b >> bufferRem); - bufferBits += 8 - bufferRem; - if (bufferBits == WP512_BLOCK_SIZE * 8) { - wp512_process_buffer(wctx); - bufferBits = bufferPos = 0; - } - buffer[bufferPos] = b << (8 - bufferRem); - bufferBits += bufferRem; - bits_len -= 8; - sourcePos++; - } - if (bits_len > 0) { - b = (source[sourcePos] << sourceGap) & 0xff; - buffer[bufferPos] |= b >> bufferRem; - } else { - b = 0; - } - if (bufferRem + bits_len < 8) { - bufferBits += bits_len; - } else { - bufferPos++; - bufferBits += 8 - bufferRem; - bits_len -= 8 - bufferRem; - if (bufferBits == WP512_BLOCK_SIZE * 8) { - wp512_process_buffer(wctx); - bufferBits = bufferPos = 0; - } - buffer[bufferPos] = b << (8 - bufferRem); - bufferBits += (int)bits_len; - } - - wctx->bufferBits = bufferBits; - wctx->bufferPos = bufferPos; - - return 0; } -static int wp512_final(struct shash_desc *desc, u8 *out) +static int wp512_update(struct shash_desc *desc, const u8 *source, + unsigned int len) +{ + struct wp512_ctx *wctx = shash_desc_ctx(desc); + unsigned int remain = len % WP512_BLOCK_SIZE; + u64 bits_len = (len - remain) * 8ull; + u8 *bitLength = wctx->bitLength; + + wp512_add_length(bitLength, bits_len); + do { + wp512_process_buffer(wctx, source); + source += WP512_BLOCK_SIZE; + bits_len -= WP512_BLOCK_SIZE * 8; + } while (bits_len); + + return remain; +} + +static int wp512_finup(struct shash_desc *desc, const u8 *src, + unsigned int bufferPos, u8 *out) { struct wp512_ctx *wctx = shash_desc_ctx(desc); int i; - u8 *buffer = wctx->buffer; u8 *bitLength = wctx->bitLength; - int bufferBits = wctx->bufferBits; - int bufferPos = wctx->bufferPos; __be64 *digest = (__be64 *)out; + u8 buffer[WP512_BLOCK_SIZE]; - buffer[bufferPos] |= 0x80U >> (bufferBits & 7); + wp512_add_length(bitLength, bufferPos * 8); + memcpy(buffer, src, bufferPos); + buffer[bufferPos] = 0x80U; bufferPos++; if (bufferPos > WP512_BLOCK_SIZE - WP512_LENGTHBYTES) { if (bufferPos < WP512_BLOCK_SIZE) memset(&buffer[bufferPos], 0, WP512_BLOCK_SIZE - bufferPos); - wp512_process_buffer(wctx); + wp512_process_buffer(wctx, buffer); bufferPos = 0; } if (bufferPos < WP512_BLOCK_SIZE - WP512_LENGTHBYTES) @@ -1086,31 +1051,32 @@ static int wp512_final(struct shash_desc *desc, u8 *out) bufferPos = WP512_BLOCK_SIZE - WP512_LENGTHBYTES; memcpy(&buffer[WP512_BLOCK_SIZE - WP512_LENGTHBYTES], bitLength, WP512_LENGTHBYTES); - wp512_process_buffer(wctx); + wp512_process_buffer(wctx, buffer); + memzero_explicit(buffer, sizeof(buffer)); for (i = 0; i < WP512_DIGEST_SIZE/8; i++) digest[i] = cpu_to_be64(wctx->hash[i]); - wctx->bufferBits = bufferBits; - wctx->bufferPos = bufferPos; return 0; } -static int wp384_final(struct shash_desc *desc, u8 *out) +static int wp384_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { u8 D[64]; - wp512_final(desc, D); + wp512_finup(desc, src, len, D); memcpy(out, D, WP384_DIGEST_SIZE); memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } -static int wp256_final(struct shash_desc *desc, u8 *out) +static int wp256_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { u8 D[64]; - wp512_final(desc, D); + wp512_finup(desc, src, len, D); memcpy(out, D, WP256_DIGEST_SIZE); memzero_explicit(D, WP512_DIGEST_SIZE); @@ -1121,11 +1087,12 @@ static struct shash_alg wp_algs[3] = { { .digestsize = WP512_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, - .final = wp512_final, + .finup = wp512_finup, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp512", .cra_driver_name = "wp512-generic", + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = WP512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -1133,11 +1100,12 @@ static struct shash_alg wp_algs[3] = { { .digestsize = WP384_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, - .final = wp384_final, + .finup = wp384_finup, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp384", .cra_driver_name = "wp384-generic", + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = WP512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -1145,11 +1113,12 @@ static struct shash_alg wp_algs[3] = { { .digestsize = WP256_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, - .final = wp256_final, + .finup = wp256_finup, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp256", .cra_driver_name = "wp256-generic", + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = WP512_BLOCK_SIZE, .cra_module = THIS_MODULE, } From b872f562c8cef59743993b48eb458c2d87c1651e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 23 Jun 2025 19:11:50 +0800 Subject: [PATCH 142/297] dm-crypt: Extend state buffer size in crypt_iv_lmk_one Add a macro CRYPTO_MD5_STATESIZE for the Crypto API export state size of md5 and use that in dm-crypt instead of relying on the size of struct md5_state (the latter is currently undergoing a transition and may shrink). This commit fixes a crash on 32-bit machines: Oops: Oops: 0000 [#1] SMP CPU: 1 UID: 0 PID: 12 Comm: kworker/u16:0 Not tainted 6.16.0-rc2+ #993 PREEMPT(full) Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Workqueue: kcryptd-254:0-1 kcryptd_crypt [dm_crypt] EIP: __crypto_shash_export+0xf/0x90 Code: 4a c1 c7 40 20 a0 b4 4a c1 81 cf 0e 00 04 08 89 78 50 e9 2b ff ff ff 8d 74 26 00 55 89 e5 57 56 53 89 c3 89 d6 8b 00 8b 40 14 <8b> 50 fc f6 40 13 01 74 04 4a 2b 50 14 85 c9 74 10 89 f2 89 d8 ff EAX: 303a3435 EBX: c3007c90 ECX: 00000000 EDX: c3007c38 ESI: c3007c38 EDI: c3007c90 EBP: c3007bfc ESP: c3007bf0 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010216 CR0: 80050033 CR2: 303a3431 CR3: 04fbe000 CR4: 00350e90 Call Trace: crypto_shash_export+0x65/0xc0 crypt_iv_lmk_one+0x106/0x1a0 [dm_crypt] Fixes: efd62c85525e ("crypto: md5-generic - Use API partial block handling") Reported-by: Milan Broz Signed-off-by: Herbert Xu Tested-by: Milan Broz Closes: https://lore.kernel.org/linux-crypto/f1625ddc-e82e-4b77-80c2-dc8e45b54848@gmail.com/T/ Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 11 +++++++---- include/crypto/hash.h | 2 ++ include/crypto/md5.h | 4 ++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9dfdb63220d7..17157c4216a5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -517,7 +517,10 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); - struct md5_state md5state; + union { + struct md5_state md5state; + u8 state[CRYPTO_MD5_STATESIZE]; + } u; __le32 buf[4]; int i, r; @@ -548,13 +551,13 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, return r; /* No MD5 padding here */ - r = crypto_shash_export(desc, &md5state); + r = crypto_shash_export(desc, &u.md5state); if (r) return r; for (i = 0; i < MD5_HASH_WORDS; i++) - __cpu_to_le32s(&md5state.hash[i]); - memcpy(iv, &md5state.hash, cc->iv_size); + __cpu_to_le32s(&u.md5state.hash[i]); + memcpy(iv, &u.md5state.hash, cc->iv_size); return 0; } diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 6f6b9de12cd3..db294d452e8c 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -202,6 +202,8 @@ struct shash_desc { #define HASH_REQUEST_CLONE(name, gfp) \ hash_request_clone(name, sizeof(__##name##_req), gfp) +#define CRYPTO_HASH_STATESIZE(coresize, blocksize) (coresize + blocksize + 1) + /** * struct shash_alg - synchronous message digest definition * @init: see struct ahash_alg diff --git a/include/crypto/md5.h b/include/crypto/md5.h index 198b5d69b92f..28ee533a0507 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -2,6 +2,7 @@ #ifndef _CRYPTO_MD5_H #define _CRYPTO_MD5_H +#include #include #define MD5_DIGEST_SIZE 16 @@ -15,6 +16,9 @@ #define MD5_H2 0x98badcfeUL #define MD5_H3 0x10325476UL +#define CRYPTO_MD5_STATESIZE \ + CRYPTO_HASH_STATESIZE(MD5_STATE_SIZE, MD5_HMAC_BLOCK_SIZE) + extern const u8 md5_zero_message_hash[MD5_DIGEST_SIZE]; struct md5_state { From 9205999e9f13a07cb29d5a8836c25afdca186007 Mon Sep 17 00:00:00 2001 From: Ankit Nautiyal Date: Wed, 18 Jun 2025 18:39:50 +0530 Subject: [PATCH 143/297] drm/i915/snps_hdmi_pll: Fix 64-bit divisor truncation by using div64_u64 DIV_ROUND_CLOSEST_ULL uses do_div(), which expects a 32-bit divisor. When passing a 64-bit constant like CURVE2_MULTIPLIER, the value is silently truncated to u32, potentially leading to incorrect results on large divisors. Replace DIV_ROUND_CLOSEST_ULL with DIV64_U64_ROUND_CLOSEST which correctly handles full 64-bit division. v2: Use DIV64_U64_ROUND_CLOSEST instead of div64_u64 macro. (Jani) Fixes: 5947642004bf ("drm/i915/display: Add support for SNPS PHY HDMI PLL algorithm for DG2") Reported-by: Vas Novikov Closes: https://lore.kernel.org/all/8d7c7958-9558-4c8a-a81a-e9310f2d8852@gmail.com/ Cc: Ankit Nautiyal Cc: Suraj Kandpal Cc: Jani Nikula Cc: Vas Novikov Cc: stable@vger.kernel.org # v6.15+ Reviewed-by: Jani Nikula Signed-off-by: Ankit Nautiyal Link: https://lore.kernel.org/r/20250618130951.1596587-2-ankit.k.nautiyal@intel.com (cherry picked from commit b300a175a11e6a934d728317dc39787723cc7917) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c b/drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c index 74bb3bedf30f..5111bdc3075b 100644 --- a/drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c +++ b/drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c @@ -103,8 +103,8 @@ static void get_ana_cp_int_prop(u64 vco_clk, DIV_ROUND_DOWN_ULL(curve_1_interpolated, CURVE0_MULTIPLIER))); ana_cp_int_temp = - DIV_ROUND_CLOSEST_ULL(DIV_ROUND_DOWN_ULL(adjusted_vco_clk1, curve_2_scaled1), - CURVE2_MULTIPLIER); + DIV64_U64_ROUND_CLOSEST(DIV_ROUND_DOWN_ULL(adjusted_vco_clk1, curve_2_scaled1), + CURVE2_MULTIPLIER); *ana_cp_int = max(1, min(ana_cp_int_temp, 127)); From a24cc6ce1933eade12aa2b9859de0fcd2dac2c06 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 23 Jun 2025 10:34:08 +0200 Subject: [PATCH 144/297] futex: Initialize futex_phash_new during fork(). During a hash resize operation the new private hash is stored in mm_struct::futex_phash_new if the current hash can not be immediately replaced. The new hash must not be copied during fork() into the new task. Doing so will lead to a double-free of the memory by the two tasks. Initialize the mm_struct::futex_phash_new during fork(). Closes: https://lore.kernel.org/all/aFBQ8CBKmRzEqIfS@mozart.vkv.me/ Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash") Reported-by: Calvin Owens Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Calvin Owens Link: https://lkml.kernel.org/r/20250623083408.jTiJiC6_@linutronix.de --- include/linux/futex.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/futex.h b/include/linux/futex.h index 005b040c4791..b37193653e6b 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -89,6 +89,7 @@ void futex_hash_free(struct mm_struct *mm); static inline void futex_mm_init(struct mm_struct *mm) { RCU_INIT_POINTER(mm->futex_phash, NULL); + mm->futex_phash_new = NULL; mutex_init(&mm->futex_hash_lock); } From a3ef3c2da675a8a564c8bea1a511cdd0a2a9aa49 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Thu, 5 Jun 2025 11:28:46 +0300 Subject: [PATCH 145/297] drm/dp: Change AUX DPCD probe address from DPCD_REV to LANE0_1_STATUS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reading DPCD registers has side-effects in general. In particular accessing registers outside of the link training register range (0x102-0x106, 0x202-0x207, 0x200c-0x200f, 0x2216) is explicitly forbidden by the DP v2.1 Standard, see 3.6.5.1 DPTX AUX Transaction Handling Mandates 3.6.7.4 128b/132b DP Link Layer LTTPR Link Training Mandates Based on my tests, accessing the DPCD_REV register during the link training of an UHBR TBT DP tunnel sink leads to link training failures. Solve the above by using the DP_LANE0_1_STATUS (0x202) register for the DPCD register access quirk. Cc: Cc: Ville Syrjälä Cc: Jani Nikula Acked-by: Jani Nikula Signed-off-by: Imre Deak Link: https://lore.kernel.org/r/20250605082850.65136-2-imre.deak@intel.com (cherry picked from commit a40c5d727b8111b5db424a1e43e14a1dcce1e77f) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/display/drm_dp_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/display/drm_dp_helper.c b/drivers/gpu/drm/display/drm_dp_helper.c index f2a6559a2710..dc622c78db9d 100644 --- a/drivers/gpu/drm/display/drm_dp_helper.c +++ b/drivers/gpu/drm/display/drm_dp_helper.c @@ -725,7 +725,7 @@ ssize_t drm_dp_dpcd_read(struct drm_dp_aux *aux, unsigned int offset, * monitor doesn't power down exactly after the throw away read. */ if (!aux->is_remote) { - ret = drm_dp_dpcd_probe(aux, DP_DPCD_REV); + ret = drm_dp_dpcd_probe(aux, DP_LANE0_1_STATUS); if (ret < 0) return ret; } From dc6458ed95e40146699f9c523e34cb13ff127170 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 23 Jun 2025 14:14:55 +0530 Subject: [PATCH 146/297] ASoC: amd: ps: fix for soundwire failures during hibernation exit sequence During the hibernate entry sequence, ACP registers will be reset to default values and acp ip will be completely powered off including acp SoundWire pads. During resume sequence, if acp SoundWire pad keeper enable register is not restored along with pad pulldown control register value, then SoundWire manager links won't be powered on correctly results in peripheral register access failures and completely audio function is broken. Add code to store the acp SoundWire pad keeper enable register and acp pad pulldown ctrl register values before entering into suspend state and restore the register values during resume sequence based on condition check for acp SoundWire pad keeper enable register for ACP6.3, ACP7.0 & ACP7.1 platforms. Fixes: 491628388005 ("ASoC: amd: ps: add callback functions for acp pci driver pm ops") Signed-off-by: Vijendar Mukunda Link: https://patch.msgid.link/20250623084630.3100279-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/ps/acp63.h | 4 ++++ sound/soc/amd/ps/ps-common.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/sound/soc/amd/ps/acp63.h b/sound/soc/amd/ps/acp63.h index 85feae45c44c..d7c994e26e4d 100644 --- a/sound/soc/amd/ps/acp63.h +++ b/sound/soc/amd/ps/acp63.h @@ -334,6 +334,8 @@ struct acp_hw_ops { * @addr: pci ioremap address * @reg_range: ACP reigister range * @acp_rev: ACP PCI revision id + * @acp_sw_pad_keeper_en: store acp SoundWire pad keeper enable register value + * @acp_pad_pulldown_ctrl: store acp pad pulldown control register value * @acp63_sdw0-dma_intr_stat: DMA interrupt status array for ACP6.3 platform SoundWire * manager-SW0 instance * @acp63_sdw_dma_intr_stat: DMA interrupt status array for ACP6.3 platform SoundWire @@ -367,6 +369,8 @@ struct acp63_dev_data { u32 addr; u32 reg_range; u32 acp_rev; + u32 acp_sw_pad_keeper_en; + u32 acp_pad_pulldown_ctrl; u16 acp63_sdw0_dma_intr_stat[ACP63_SDW0_DMA_MAX_STREAMS]; u16 acp63_sdw1_dma_intr_stat[ACP63_SDW1_DMA_MAX_STREAMS]; u16 acp70_sdw0_dma_intr_stat[ACP70_SDW0_DMA_MAX_STREAMS]; diff --git a/sound/soc/amd/ps/ps-common.c b/sound/soc/amd/ps/ps-common.c index 1c89fb5fe1da..7b4966b75dc6 100644 --- a/sound/soc/amd/ps/ps-common.c +++ b/sound/soc/amd/ps/ps-common.c @@ -160,6 +160,8 @@ static int __maybe_unused snd_acp63_suspend(struct device *dev) adata = dev_get_drvdata(dev); if (adata->is_sdw_dev) { + adata->acp_sw_pad_keeper_en = readl(adata->acp63_base + ACP_SW0_PAD_KEEPER_EN); + adata->acp_pad_pulldown_ctrl = readl(adata->acp63_base + ACP_PAD_PULLDOWN_CTRL); adata->sdw_en_stat = check_acp_sdw_enable_status(adata); if (adata->sdw_en_stat) { writel(1, adata->acp63_base + ACP_ZSC_DSP_CTRL); @@ -197,6 +199,7 @@ static int __maybe_unused snd_acp63_runtime_resume(struct device *dev) static int __maybe_unused snd_acp63_resume(struct device *dev) { struct acp63_dev_data *adata; + u32 acp_sw_pad_keeper_en; int ret; adata = dev_get_drvdata(dev); @@ -209,6 +212,12 @@ static int __maybe_unused snd_acp63_resume(struct device *dev) if (ret) dev_err(dev, "ACP init failed\n"); + acp_sw_pad_keeper_en = readl(adata->acp63_base + ACP_SW0_PAD_KEEPER_EN); + dev_dbg(dev, "ACP_SW0_PAD_KEEPER_EN:0x%x\n", acp_sw_pad_keeper_en); + if (!acp_sw_pad_keeper_en) { + writel(adata->acp_sw_pad_keeper_en, adata->acp63_base + ACP_SW0_PAD_KEEPER_EN); + writel(adata->acp_pad_pulldown_ctrl, adata->acp63_base + ACP_PAD_PULLDOWN_CTRL); + } return ret; } @@ -408,6 +417,8 @@ static int __maybe_unused snd_acp70_suspend(struct device *dev) adata = dev_get_drvdata(dev); if (adata->is_sdw_dev) { + adata->acp_sw_pad_keeper_en = readl(adata->acp63_base + ACP_SW0_PAD_KEEPER_EN); + adata->acp_pad_pulldown_ctrl = readl(adata->acp63_base + ACP_PAD_PULLDOWN_CTRL); adata->sdw_en_stat = check_acp_sdw_enable_status(adata); if (adata->sdw_en_stat) { writel(1, adata->acp63_base + ACP_ZSC_DSP_CTRL); @@ -445,6 +456,7 @@ static int __maybe_unused snd_acp70_runtime_resume(struct device *dev) static int __maybe_unused snd_acp70_resume(struct device *dev) { struct acp63_dev_data *adata; + u32 acp_sw_pad_keeper_en; int ret; adata = dev_get_drvdata(dev); @@ -459,6 +471,12 @@ static int __maybe_unused snd_acp70_resume(struct device *dev) if (ret) dev_err(dev, "ACP init failed\n"); + acp_sw_pad_keeper_en = readl(adata->acp63_base + ACP_SW0_PAD_KEEPER_EN); + dev_dbg(dev, "ACP_SW0_PAD_KEEPER_EN:0x%x\n", acp_sw_pad_keeper_en); + if (!acp_sw_pad_keeper_en) { + writel(adata->acp_sw_pad_keeper_en, adata->acp63_base + ACP_SW0_PAD_KEEPER_EN); + writel(adata->acp_pad_pulldown_ctrl, adata->acp63_base + ACP_PAD_PULLDOWN_CTRL); + } return ret; } From 95b6759a81833d0e8c7456430186c2f6d174764e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 15:09:53 +0200 Subject: [PATCH 147/297] net: qed: reduce stack usage for TLV processing clang gets a bit confused by the code in the qed_mfw_process_tlv_req and ends up spilling registers to the stack hundreds of times. When sanitizers are enabled, this can end up blowing the stack warning limit: drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c:1244:5: error: stack frame size (1824) exceeds limit (1280) in 'qed_mfw_process_tlv_req' [-Werror,-Wframe-larger-than] Apparently the problem is the complexity of qed_mfw_update_tlvs() after inlining, and marking the four main branches of that function as noinline_for_stack makes this problem completely go away, the stack usage goes down to 100 bytes. Signed-off-by: Arnd Bergmann Reviewed-by: Alexander Lobakin Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c index f55eed092f25..7d78f072b0a1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c @@ -242,7 +242,7 @@ static int qed_mfw_get_tlv_group(u8 tlv_type, u8 *tlv_group) } /* Returns size of the data buffer or, -1 in case TLV data is not available. */ -static int +static noinline_for_stack int qed_mfw_get_gen_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_generic *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) @@ -304,7 +304,7 @@ qed_mfw_get_gen_tlv_value(struct qed_drv_tlv_hdr *p_tlv, return -1; } -static int +static noinline_for_stack int qed_mfw_get_eth_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_eth *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) @@ -438,7 +438,7 @@ qed_mfw_get_tlv_time_value(struct qed_mfw_tlv_time *p_time, return QED_MFW_TLV_TIME_SIZE; } -static int +static noinline_for_stack int qed_mfw_get_fcoe_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_fcoe *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) @@ -1073,7 +1073,7 @@ qed_mfw_get_fcoe_tlv_value(struct qed_drv_tlv_hdr *p_tlv, return -1; } -static int +static noinline_for_stack int qed_mfw_get_iscsi_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_iscsi *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) From ff8abbd248c1f52df0c321690b88454b13ff54b2 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 22 Jun 2025 14:13:40 -0300 Subject: [PATCH 148/297] smb: client: fix regression with native SMB symlinks Some users and customers reported that their backup/copy tools started to fail when the directory being copied contained symlink targets that the client couldn't parse - even when those symlinks weren't followed. Fix this by allowing lstat(2) and readlink(2) to succeed even when the client can't resolve the symlink target, restoring old behavior. Cc: linux-cifs@vger.kernel.org Cc: stable@vger.kernel.org Reported-by: Remy Monsen Closes: https://lore.kernel.org/r/CAN+tdP7y=jqw3pBndZAGjQv0ObFq8Q=+PUDHgB36HdEz9QA6FQ@mail.gmail.com Reported-by: Pierguido Lambri Fixes: 12b466eb52d9 ("cifs: Fix creating and resolving absolute NT-style symlinks") Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/reparse.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 511611206dab..1c40e42e4d89 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -875,15 +875,8 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, abs_path += sizeof("\\DosDevices\\")-1; else if (strstarts(abs_path, "\\GLOBAL??\\")) abs_path += sizeof("\\GLOBAL??\\")-1; - else { - /* Unhandled absolute symlink, points outside of DOS/Win32 */ - cifs_dbg(VFS, - "absolute symlink '%s' cannot be converted from NT format " - "because points to unknown target\n", - smb_target); - rc = -EIO; - goto out; - } + else + goto out_unhandled_target; /* Sometimes path separator after \?? is double backslash */ if (abs_path[0] == '\\') @@ -910,13 +903,7 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, abs_path++; abs_path[0] = drive_letter; } else { - /* Unhandled absolute symlink. Report an error. */ - cifs_dbg(VFS, - "absolute symlink '%s' cannot be converted from NT format " - "because points to unknown target\n", - smb_target); - rc = -EIO; - goto out; + goto out_unhandled_target; } abs_path_len = strlen(abs_path)+1; @@ -966,6 +953,7 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, * These paths have same format as Linux symlinks, so no * conversion is needed. */ +out_unhandled_target: linux_target = smb_target; smb_target = NULL; } From db53805156f1e0aa6d059c0d3f9ac660d4ef3eb4 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 10 Jun 2025 20:53:30 +0200 Subject: [PATCH 149/297] dm-raid: fix variable in journal device check Replace "rdev" with correct loop variable name "r". Signed-off-by: Heinz Mauelshagen Cc: stable@vger.kernel.org Fixes: 63c32ed4afc2 ("dm raid: add raid4/5/6 journaling support") Signed-off-by: Mikulas Patocka --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index d296770478b2..e8c0a8c6fb51 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2407,7 +2407,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ sb_retrieve_failed_devices(sb, failed_devices); rdev_for_each(r, mddev) { - if (test_bit(Journal, &rdev->flags) || + if (test_bit(Journal, &r->flags) || !r->sb_page) continue; sb2 = page_address(r->sb_page); From 88a80066af1617fab444776135d840467414beb6 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Mon, 23 Jun 2025 19:02:18 +0800 Subject: [PATCH 150/297] io_uring: make fallocate be hashed work Like ftruncate and write, fallocate operations on the same file cannot be executed in parallel, so it is better to make fallocate be hashed work. Signed-off-by: Fengnan Chang Link: https://lore.kernel.org/r/20250623110218.61490-1-changfengnan@bytedance.com Signed-off-by: Jens Axboe --- io_uring/opdef.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 6e0882b051f9..6de6229207a8 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .hash_reg_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, From 1d6123102e9fbedc8d25bf4731da6d513173e49e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 17 Jun 2025 09:58:13 -0700 Subject: [PATCH 151/297] Bluetooth: hci_core: Fix use-after-free in vhci_flush() syzbot reported use-after-free in vhci_flush() without repro. [0] From the splat, a thread close()d a vhci file descriptor while its device was being used by iotcl() on another thread. Once the last fd refcnt is released, vhci_release() calls hci_unregister_dev(), hci_free_dev(), and kfree() for struct vhci_data, which is set to hci_dev->dev->driver_data. The problem is that there is no synchronisation after unlinking hdev from hci_dev_list in hci_unregister_dev(). There might be another thread still accessing the hdev which was fetched before the unlink operation. We can use SRCU for such synchronisation. Let's run hci_dev_reset() under SRCU and wait for its completion in hci_unregister_dev(). Another option would be to restore hci_dev->destruct(), which was removed in commit 587ae086f6e4 ("Bluetooth: Remove unused hci-destruct cb"). However, this would not be a good solution, as we should not run hci_unregister_dev() while there are in-flight ioctl() requests, which could lead to another data-race KCSAN splat. Note that other drivers seem to have the same problem, for exmaple, virtbt_remove(). [0]: BUG: KASAN: slab-use-after-free in skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] BUG: KASAN: slab-use-after-free in skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 Read of size 8 at addr ffff88807cb8d858 by task syz.1.219/6718 CPU: 1 UID: 0 PID: 6718 Comm: syz.1.219 Not tainted 6.16.0-rc1-syzkaller-00196-g08207f42d3ff #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xd2/0x2b0 mm/kasan/report.c:521 kasan_report+0x118/0x150 mm/kasan/report.c:634 skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 skb_queue_purge include/linux/skbuff.h:3368 [inline] vhci_flush+0x44/0x50 drivers/bluetooth/hci_vhci.c:69 hci_dev_do_reset net/bluetooth/hci_core.c:552 [inline] hci_dev_reset+0x420/0x5c0 net/bluetooth/hci_core.c:592 sock_do_ioctl+0xd9/0x300 net/socket.c:1190 sock_ioctl+0x576/0x790 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fcf5b98e929 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fcf5c7b9038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fcf5bbb6160 RCX: 00007fcf5b98e929 RDX: 0000000000000000 RSI: 00000000400448cb RDI: 0000000000000009 RBP: 00007fcf5ba10b39 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007fcf5bbb6160 R15: 00007ffd6353d528 Allocated by task 6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] vhci_open+0x57/0x360 drivers/bluetooth/hci_vhci.c:635 misc_open+0x2bc/0x330 drivers/char/misc.c:161 chrdev_open+0x4c9/0x5e0 fs/char_dev.c:414 do_dentry_open+0xdf0/0x1970 fs/open.c:964 vfs_open+0x3b/0x340 fs/open.c:1094 do_open fs/namei.c:3887 [inline] path_openat+0x2ee5/0x3830 fs/namei.c:4046 do_filp_open+0x1fa/0x410 fs/namei.c:4073 do_sys_openat2+0x121/0x1c0 fs/open.c:1437 do_sys_open fs/open.c:1452 [inline] __do_sys_openat fs/open.c:1468 [inline] __se_sys_openat fs/open.c:1463 [inline] __x64_sys_openat+0x138/0x170 fs/open.c:1463 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 vhci_release+0xbc/0xd0 drivers/bluetooth/hci_vhci.c:671 __fput+0x44c/0xa70 fs/file_table.c:465 task_work_run+0x1d1/0x260 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0x6ad/0x22e0 kernel/exit.c:955 do_group_exit+0x21c/0x2d0 kernel/exit.c:1104 __do_sys_exit_group kernel/exit.c:1115 [inline] __se_sys_exit_group kernel/exit.c:1113 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1113 x64_sys_call+0x21ba/0x21c0 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff88807cb8d800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 88 bytes inside of freed 1024-byte region [ffff88807cb8d800, ffff88807cb8dc00) Fixes: bf18c7118cf8 ("Bluetooth: vhci: Free driver_data on file release") Reported-by: syzbot+2faa4825e556199361f9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f62d64848fc4c7c30cd6 Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 34 ++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a760f05fa3fb..9fc8f544e20e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -347,6 +348,7 @@ struct adv_monitor { struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; struct ida unset_handle_ida; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 07a8b4281a39..14d7221b8ac0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -64,7 +64,7 @@ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. */ -struct hci_dev *hci_dev_get(int index) +static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; @@ -77,6 +77,8 @@ struct hci_dev *hci_dev_get(int index) list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); + if (srcu_index) + *srcu_index = srcu_read_lock(&d->srcu); break; } } @@ -84,6 +86,22 @@ struct hci_dev *hci_dev_get(int index) return hdev; } +struct hci_dev *hci_dev_get(int index) +{ + return __hci_dev_get(index, NULL); +} + +static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) +{ + return __hci_dev_get(index, srcu_index); +} + +static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) +{ + srcu_read_unlock(&hdev->srcu, srcu_index); + hci_dev_put(hdev); +} + /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) @@ -568,9 +586,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; - int err; + int err, srcu_index; - hdev = hci_dev_get(dev); + hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; @@ -592,7 +610,7 @@ int hci_dev_reset(__u16 dev) err = hci_dev_do_reset(hdev); done: - hci_dev_put(hdev); + hci_dev_put_srcu(hdev, srcu_index); return err; } @@ -2433,6 +2451,11 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) if (!hdev) return NULL; + if (init_srcu_struct(&hdev->srcu)) { + kfree(hdev); + return NULL; + } + hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); @@ -2678,6 +2701,9 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + synchronize_srcu(&hdev->srcu); + cleanup_srcu_struct(&hdev->srcu); + disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); From 9a07ca9a4015f8f71e2b594ee76ac55483babd89 Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Mon, 23 Jun 2025 14:30:23 +0800 Subject: [PATCH 152/297] ALSA: hda/realtek: fix mute/micmute LEDs for HP EliteBook 6 G1a HP EliteBook 6 G1a laptops use ALC236 codec and need the fixup ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF to make the mic/micmute LEDs work. Signed-off-by: Chris Chiu Link: https://patch.msgid.link/20250623063023.374920-1-chris.chiu@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e144ebb08841..166730f00e5e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10909,7 +10909,9 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8def, "HP EliteBook 660 G12", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8df0, "HP EliteBook 630 G12", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8df1, "HP EliteBook 630 G12", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8dfb, "HP EliteBook 6 G1a 14", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8dfc, "HP EliteBook 645 G12", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8dfd, "HP EliteBook 6 G1a 16", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8dfe, "HP EliteBook 665 G12", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8e11, "HP Trekker", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e12, "HP Trekker", ALC287_FIXUP_CS35L41_I2C_2), From fb4e2a6e8f28a3c0ad382e363aeb9cd822007b8a Mon Sep 17 00:00:00 2001 From: Youngjun Lee Date: Mon, 23 Jun 2025 20:05:25 +0900 Subject: [PATCH 153/297] ALSA: usb-audio: Fix out-of-bounds read in snd_usb_get_audioformat_uac3() In snd_usb_get_audioformat_uac3(), the length value returned from snd_usb_ctl_msg() is used directly for memory allocation without validation. This length is controlled by the USB device. The allocated buffer is cast to a uac3_cluster_header_descriptor and its fields are accessed without verifying that the buffer is large enough. If the device returns a smaller than expected length, this leads to an out-of-bounds read. Add a length check to ensure the buffer is large enough for uac3_cluster_header_descriptor. Signed-off-by: Youngjun Lee Fixes: 9a2fe9b801f5 ("ALSA: usb: initial USB Audio Device Class 3.0 support") Link: https://patch.msgid.link/20250623-uac3-oob-fix-v1-1-527303eaf40a@samsung.com Signed-off-by: Takashi Iwai --- sound/usb/stream.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/stream.c b/sound/usb/stream.c index c1ea8844a46f..aa91d63749f2 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -987,6 +987,8 @@ snd_usb_get_audioformat_uac3(struct snd_usb_audio *chip, * and request Cluster Descriptor */ wLength = le16_to_cpu(hc_header.wLength); + if (wLength < sizeof(cluster)) + return NULL; cluster = kzalloc(wLength, GFP_KERNEL); if (!cluster) return ERR_PTR(-ENOMEM); From 41c66461cb2e8d3934a5395f27e572ebe63696b4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 23 Jun 2025 17:18:39 +0200 Subject: [PATCH 154/297] ALSA: hda/realtek: Add mic-mute LED setup for ASUS UM5606 ASUS UM5606* models use the quirk to set up the bass speakers, but it missed the mic-mute LED configuration. Other similar models have the AMD ACP dmic, and the mic-mute is set up for that, but those models don't have AMD ACP but rather built-in mics of Realtek codec, hence the Realtek driver should set it up, instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220125 Link: https://patch.msgid.link/20250623151841.28810-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 166730f00e5e..784644a2b51d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6610,6 +6610,7 @@ static void alc294_fixup_bass_speaker_15(struct hda_codec *codec, if (action == HDA_FIXUP_ACT_PRE_PROBE) { static const hda_nid_t conn[] = { 0x02, 0x03 }; snd_hda_override_conn_list(codec, 0x15, ARRAY_SIZE(conn), conn); + snd_hda_gen_add_micmute_led_cdev(codec, NULL); } } From 5aa326a6a2ff0a7e7c6e11777045e66704c2d5e4 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sun, 8 Jun 2025 09:03:05 +0530 Subject: [PATCH 155/297] PCI/PTM: Build debugfs code only if CONFIG_DEBUG_FS is enabled Otherwise, the following build error will happen for CONFIG_DEBUG_FS=n && CONFIG_PCIE_PTM=y: drivers/pci/pcie/ptm.c:498:25: error: redefinition of 'pcie_ptm_create_debugfs' 498 | struct pci_ptm_debugfs *pcie_ptm_create_debugfs(struct device *dev, void *pdata, | ^ ./include/linux/pci.h:1915:2: note: previous definition is here 1915 | *pcie_ptm_create_debugfs(struct device *dev, void *pdata, | ^ drivers/pci/pcie/ptm.c:546:6: error: redefinition of 'pcie_ptm_destroy_debugfs' 546 | void pcie_ptm_destroy_debugfs(struct pci_ptm_debugfs *ptm_debugfs) | ^ ./include/linux/pci.h:1918:1: note: previous definition is here 1918 | pcie_ptm_destroy_debugfs(struct pci_ptm_debugfs *ptm_debugfs) { } | Fixes: 132833405e61 ("PCI: Add debugfs support for exposing PTM context") Reported-by: Eric Biggers Closes: https://lore.kernel.org/linux-pci/20250607025506.GA16607@sol Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Tested-by: Eric Biggers Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250608033305.15214-1-manivannan.sadhasivam@linaro.org --- drivers/pci/pcie/ptm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index ee5f615a9023..4bd73f038ffb 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -254,6 +254,7 @@ bool pcie_ptm_enabled(struct pci_dev *dev) } EXPORT_SYMBOL(pcie_ptm_enabled); +#if IS_ENABLED(CONFIG_DEBUG_FS) static ssize_t context_update_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { @@ -552,3 +553,4 @@ void pcie_ptm_destroy_debugfs(struct pci_ptm_debugfs *ptm_debugfs) debugfs_remove_recursive(ptm_debugfs->debugfs); } EXPORT_SYMBOL_GPL(pcie_ptm_destroy_debugfs); +#endif From 7484e15dbb016d9d40f8c6e0475810212ae181db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Jun 2025 00:09:51 -0400 Subject: [PATCH 156/297] replace collect_mounts()/drop_collected_mounts() with a safer variant collect_mounts() has several problems - one can't iterate over the results directly, so it has to be done with callback passed to iterate_mounts(); it has an oopsable race with d_invalidate(); it creates temporary clones of mounts invisibly for sync umount (IOW, you can have non-lazy umount succeed leaving filesystem not mounted anywhere and yet still busy). A saner approach is to give caller an array of struct path that would pin every mount in a subtree, without cloning any mounts. * collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone * collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or a pointer to array of struct path, one for each chunk of tree visible under 'where' (i.e. the first element is a copy of where, followed by (mount,root) for everything mounted under it - the same set collect_mounts() would give). Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning references to the roots of subtrees in the caller's namespace. Array is terminated by {NULL, NULL} struct path. If it fits into preallocated array (on-stack, normally), that's where it goes; otherwise it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated' is ignored (and expected to be NULL). * drop_collected_paths(paths, preallocated) is given the array returned by an earlier call of collect_paths() and the preallocated array passed to that call. All mount/dentry references are dropped and array is kfree'd if it's not equal to 'preallocated'. * instead of iterate_mounts(), users should just iterate over array of struct path - nothing exotic is needed for that. Existing users (all in audit_tree.c) are converted. [folded a fix for braino reported by Venkat Rao Bagalkote ] Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry") Tested-by: Venkat Rao Bagalkote Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 9 +++ fs/namespace.c | 99 ++++++++++++++++----------- fs/pnode.h | 2 - include/linux/mount.h | 6 +- kernel/audit_tree.c | 63 +++++++++-------- 5 files changed, 105 insertions(+), 74 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 3616d7161dab..a5734bdd1cc7 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1249,3 +1249,12 @@ Using try_lookup_noperm() will require linux/namei.h to be included. Calling conventions for ->d_automount() have changed; we should *not* grab an extra reference to new mount - it should be returned with refcount 1. + +--- + +collect_mounts()/drop_collected_mounts()/iterate_mounts() are gone now. +Replacement is collect_paths()/drop_collected_path(), with no special +iterator needed. Instead of a cloned mount tree, the new interface returns +an array of struct path, one for each mount collect_mounts() would've +created. These struct path point to locations in the caller's namespace +that would be roots of the cloned mounts. diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..14601ec4c2c5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2310,21 +2310,62 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, return dst_mnt; } -/* Caller should check returned pointer for errors */ - -struct vfsmount *collect_mounts(const struct path *path) +static inline bool extend_array(struct path **res, struct path **to_free, + unsigned n, unsigned *count, unsigned new_count) { - struct mount *tree; - namespace_lock(); - if (!check_mnt(real_mount(path->mnt))) - tree = ERR_PTR(-EINVAL); - else - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); - namespace_unlock(); - if (IS_ERR(tree)) - return ERR_CAST(tree); - return &tree->mnt; + struct path *p; + + if (likely(n < *count)) + return true; + p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); + if (p && *count) + memcpy(p, *res, *count * sizeof(struct path)); + *count = new_count; + kfree(*to_free); + *to_free = *res = p; + return p; +} + +struct path *collect_paths(const struct path *path, + struct path *prealloc, unsigned count) +{ + struct mount *root = real_mount(path->mnt); + struct mount *child; + struct path *res = prealloc, *to_free = NULL; + unsigned n = 0; + + guard(rwsem_read)(&namespace_sem); + + if (!check_mnt(root)) + return ERR_PTR(-EINVAL); + if (!extend_array(&res, &to_free, 0, &count, 32)) + return ERR_PTR(-ENOMEM); + res[n++] = *path; + list_for_each_entry(child, &root->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, path->dentry)) + continue; + for (struct mount *m = child; m; m = next_mnt(m, child)) { + if (!extend_array(&res, &to_free, n, &count, 2 * count)) + return ERR_PTR(-ENOMEM); + res[n].mnt = &m->mnt; + res[n].dentry = m->mnt.mnt_root; + n++; + } + } + if (!extend_array(&res, &to_free, n, &count, count + 1)) + return ERR_PTR(-ENOMEM); + memset(res + n, 0, (count - n) * sizeof(struct path)); + for (struct path *p = res; p->mnt; p++) + path_get(p); + return res; +} + +void drop_collected_paths(struct path *paths, struct path *prealloc) +{ + for (struct path *p = paths; p->mnt; p++) + path_put(p); + if (paths != prealloc) + kfree(paths); } static void free_mnt_ns(struct mnt_namespace *); @@ -2401,15 +2442,6 @@ void dissolve_on_fput(struct vfsmount *mnt) free_mnt_ns(ns); } -void drop_collected_mounts(struct vfsmount *mnt) -{ - namespace_lock(); - lock_mount_hash(); - umount_tree(real_mount(mnt), 0); - unlock_mount_hash(); - namespace_unlock(); -} - static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2511,21 +2543,6 @@ struct vfsmount *clone_private_mount(const struct path *path) } EXPORT_SYMBOL_GPL(clone_private_mount); -int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - struct vfsmount *root) -{ - struct mount *mnt; - int res = f(root, arg); - if (res) - return res; - list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { - res = f(&mnt->mnt, arg); - if (res) - return res; - } - return 0; -} - static void lock_mnt_tree(struct mount *mnt) { struct mount *p; @@ -6262,7 +6279,11 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - drop_collected_mounts(&ns->root->mnt); + namespace_lock(); + lock_mount_hash(); + umount_tree(ns->root, 0); + unlock_mount_hash(); + namespace_unlock(); free_mnt_ns(ns); } diff --git a/fs/pnode.h b/fs/pnode.h index 34b6247af01d..2d026fb98b18 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -28,8 +28,6 @@ #define CL_SHARED_TO_SLAVE 0x20 #define CL_COPY_MNT_NS_FILE 0x40 -#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) - static inline void set_mnt_shared(struct mount *mnt) { mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; diff --git a/include/linux/mount.h b/include/linux/mount.h index 4880f434c021..1a508beba446 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -116,10 +116,8 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(const struct path *); -extern void drop_collected_mounts(struct vfsmount *); -extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, - struct vfsmount *); +extern struct path *collect_paths(const struct path *, struct path *, unsigned); +extern void drop_collected_paths(struct path *, struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..b0eae2a3c895 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +677,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +691,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +701,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + struct path *paths; int err; rule->tree = NULL; @@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, path2; - struct vfsmount *tagged; + struct path array[16]; + struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } From ce7df19686530920f2f6b636e71ce5eb1d9303ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Jun 2025 18:03:29 -0400 Subject: [PATCH 157/297] attach_recursive_mnt(): do not lock the covering tree when sliding something under it If we are propagating across the userns boundary, we need to lock the mounts added there. However, in case when something has already been mounted there and we end up sliding a new tree under that, the stuff that had been there before should not get locked. IOW, lock_mnt_tree() should be called before we reparent the preexisting tree on top of what we are adding. Fixes: 3bd045cc9c4b ("separate copying and locking mount tree on cross-userns copies") Signed-off-by: Al Viro --- fs/namespace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 14601ec4c2c5..eed83254492f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2768,14 +2768,14 @@ static int attach_recursive_mnt(struct mount *source_mnt, hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt(&child->mnt_parent->mnt, - child->mnt_mountpoint); - if (q) - mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); child->mnt.mnt_flags &= ~MNT_LOCKED; + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); commit_tree(child); } put_mountpoint(smp); From aa485e8789d56a4573f7c8d000a182b749eaa64d Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Wed, 18 Jun 2025 09:19:33 +0800 Subject: [PATCH 158/297] libbpf: Fix null pointer dereference in btf_dump__free on allocation failure When btf_dump__new() fails to allocate memory for the internal hashmap (btf_dump->type_names), it returns an error code. However, the cleanup function btf_dump__free() does not check if btf_dump->type_names is NULL before attempting to free it. This leads to a null pointer dereference when btf_dump__free() is called on a btf_dump object. Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion") Signed-off-by: Yuan Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250618011933.11423-1-chenyuan_fl@163.com --- tools/lib/bpf/btf_dump.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 460c3e57fadb..0381f209920a 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -226,6 +226,9 @@ static void btf_dump_free_names(struct hashmap *map) size_t bkt; struct hashmap_entry *cur; + if (!map) + return; + hashmap__for_each_entry(map, cur, bkt) free((void *)cur->pkey); From f5990207026987a353d5a95204c4d9cb725637fd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 20 Jun 2025 11:48:55 -0700 Subject: [PATCH 159/297] net: netpoll: Initialize UDP checksum field before checksumming commit f1fce08e63fe ("netpoll: Eliminate redundant assignment") removed the initialization of the UDP checksum, which was wrong and broke netpoll IPv6 transmission due to bad checksumming. udph->check needs to be set before calling csum_ipv6_magic(). Fixes: f1fce08e63fe ("netpoll: Eliminate redundant assignment") Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250620-netpoll_fix-v1-1-f9f0b82bc059@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4ddb7490df4b..6ad84d4a2b46 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -432,6 +432,7 @@ int netpoll_send_udp(struct netpoll *np, const char *msg, int len) udph->dest = htons(np->remote_port); udph->len = htons(udp_len); + udph->check = 0; if (np->ipv6) { udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, @@ -460,7 +461,6 @@ int netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { - udph->check = 0; udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, From 2b8be57fa0c88ac824a906f29c04d728f9f6047a Mon Sep 17 00:00:00 2001 From: Zhe Qiao Date: Thu, 19 Jun 2025 15:26:08 +0800 Subject: [PATCH 160/297] Revert "PCI/ACPI: Fix allocated memory release on error in pci_acpi_scan_root()" This reverts commit 631b2af2f357 ("PCI/ACPI: Fix allocated memory release on error in pci_acpi_scan_root()"). The reverted patch causes the 'ri->cfg' and 'root_ops' resources to be released multiple times. When acpi_pci_root_create() fails, these resources have already been released internally by the __acpi_pci_root_release_info() function. Releasing them again in pci_acpi_scan_root() leads to incorrect behavior and potential memory issues. We plan to resolve the issue using a more appropriate fix. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/aEmdnuw715btq7Q5@stanley.mountain/ Signed-off-by: Zhe Qiao Acked-by: Dan Carpenter Link: https://patch.msgid.link/20250619072608.2075475-1-qiaozhe@iscas.ac.cn Signed-off-by: Rafael J. Wysocki --- drivers/pci/pci-acpi.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index b78e0e417324..af370628e583 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1676,19 +1676,24 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL); - if (!root_ops) - goto free_ri; + if (!root_ops) { + kfree(ri); + return NULL; + } ri->cfg = pci_acpi_setup_ecam_mapping(root); - if (!ri->cfg) - goto free_root_ops; + if (!ri->cfg) { + kfree(ri); + kfree(root_ops); + return NULL; + } root_ops->release_info = pci_acpi_generic_release_info; root_ops->prepare_resources = pci_acpi_root_prepare_resources; root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); if (!bus) - goto free_cfg; + return NULL; /* If we must preserve the resource configuration, claim now */ host = pci_find_host_bridge(bus); @@ -1705,14 +1710,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) pcie_bus_configure_settings(child); return bus; - -free_cfg: - pci_ecam_free(ri->cfg); -free_root_ops: - kfree(root_ops); -free_ri: - kfree(ri); - return NULL; } void pcibios_add_bus(struct pci_bus *bus) From 10357824151262636fda879845f8b64553541106 Mon Sep 17 00:00:00 2001 From: Chaoyi Chen Date: Fri, 20 Jun 2025 09:16:16 +0800 Subject: [PATCH 161/297] drm/bridge-connector: Fix bridge in drm_connector_hdmi_audio_init() The bridge used in drm_connector_hdmi_audio_init() does not correctly point to the required audio bridge, which lead to incorrect audio configuration input. Fixes: 231adeda9f67 ("drm/bridge-connector: hook DisplayPort audio support") Signed-off-by: Chaoyi Chen Reviewed-by: Dmitry Baryshkov Tested-by: Stephan Gerhold Link: https://lore.kernel.org/r/20250620011616.118-1-kernel@airkyi.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/display/drm_bridge_connector.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/display/drm_bridge_connector.c b/drivers/gpu/drm/display/drm_bridge_connector.c index 7d2e499ea5de..262e93e07a28 100644 --- a/drivers/gpu/drm/display/drm_bridge_connector.c +++ b/drivers/gpu/drm/display/drm_bridge_connector.c @@ -708,11 +708,14 @@ struct drm_connector *drm_bridge_connector_init(struct drm_device *drm, if (bridge_connector->bridge_hdmi_audio || bridge_connector->bridge_dp_audio) { struct device *dev; + struct drm_bridge *bridge; if (bridge_connector->bridge_hdmi_audio) - dev = bridge_connector->bridge_hdmi_audio->hdmi_audio_dev; + bridge = bridge_connector->bridge_hdmi_audio; else - dev = bridge_connector->bridge_dp_audio->hdmi_audio_dev; + bridge = bridge_connector->bridge_dp_audio; + + dev = bridge->hdmi_audio_dev; ret = drm_connector_hdmi_audio_init(connector, dev, &drm_bridge_connector_hdmi_audio_funcs, From 2f73c62d4e13df67380ff6faca39eec2bf08dd93 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 20 Jun 2025 13:09:39 +0200 Subject: [PATCH 162/297] Revert "riscv: misaligned: fix sleeping function called during misaligned access handling" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 61a74ad25462 ("riscv: misaligned: fix sleeping function called during misaligned access handling"). The commit addresses a sleeping in atomic context problem, but it is not the correct fix as explained by Clément: "Using nofault would lead to failure to read from user memory that is paged out for instance. This is not really acceptable, we should handle user misaligned access even at an address that would generate a page fault." This bug has been properly fixed by commit 453805f0a28f ("riscv: misaligned: enable IRQs while handling misaligned accesses"). Revert this improper fix. Link: https://lore.kernel.org/linux-riscv/b779beed-e44e-4a5e-9551-4647682b0d21@rivosinc.com/ Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Clément Léger Reviewed-by: Alexandre Ghiti Fixes: 61a74ad25462 ("riscv: misaligned: fix sleeping function called during misaligned access handling") Link: https://lore.kernel.org/r/20250620110939.1642735-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps_misaligned.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index dd8e4af6583f..93043924fe6c 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -454,7 +454,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs) val.data_u64 = 0; if (user_mode(regs)) { - if (copy_from_user_nofault(&val, (u8 __user *)addr, len)) + if (copy_from_user(&val, (u8 __user *)addr, len)) return -1; } else { memcpy(&val, (u8 *)addr, len); @@ -555,7 +555,7 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs) return -EOPNOTSUPP; if (user_mode(regs)) { - if (copy_to_user_nofault((u8 __user *)addr, &val, len)) + if (copy_to_user((u8 __user *)addr, &val, len)) return -1; } else { memcpy((u8 *)addr, &val, len); From b0843f836126c980fb31923563c86ec52e8b9514 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 20 Jun 2025 12:08:11 +0000 Subject: [PATCH 163/297] riscv: Fix sparse warning in vendor_extensions/sifive.c sparse reports the following warning: arch/riscv/kernel/vendor_extensions/sifive.c:11:33: sparse: sparse: symbol 'riscv_isa_vendor_ext_sifive' was not declared. Should it be static? So as this struct is only used in this file, make it static. Fixes: 2d147d77ae6e ("riscv: Add SiFive xsfvqmaccdod and xsfvqmaccqoq vendor extensions") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505072100.TZlEp8h1-lkp@intel.com/ Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250620-dev-alex-fix_sparse_sifive_v1-v1-1-efa3a6f93846@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vendor_extensions/sifive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c index 1411337dc1e6..8fcf67e8c07f 100644 --- a/arch/riscv/kernel/vendor_extensions/sifive.c +++ b/arch/riscv/kernel/vendor_extensions/sifive.c @@ -8,7 +8,7 @@ #include /* All SiFive vendor extensions supported in Linux */ -const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = { +static const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = { __RISCV_ISA_EXT_DATA(xsfvfnrclipxfqf, RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF), __RISCV_ISA_EXT_DATA(xsfvfwmaccqqq, RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ), __RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD), From 890ba5be6335dbbbc99af14ea007befb5f83f174 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 19 Jun 2025 17:58:58 +0200 Subject: [PATCH 164/297] Revert "riscv: Define TASK_SIZE_MAX for __access_ok()" This reverts commit ad5643cf2f69 ("riscv: Define TASK_SIZE_MAX for __access_ok()"). This commit changes TASK_SIZE_MAX to be LONG_MAX to optimize access_ok(), because the previous TASK_SIZE_MAX (default to TASK_SIZE) requires some computation. The reasoning was that all user addresses are less than LONG_MAX, and all kernel addresses are greater than LONG_MAX. Therefore access_ok() can filter kernel addresses. Addresses between TASK_SIZE and LONG_MAX are not valid user addresses, but access_ok() let them pass. That was thought to be okay, because they are not valid addresses at hardware level. Unfortunately, one case is missed: get_user_pages_fast() happily accepts addresses between TASK_SIZE and LONG_MAX. futex(), for instance, uses get_user_pages_fast(). This causes the problem reported by Robert [1]. Therefore, revert this commit. TASK_SIZE_MAX is changed to the default: TASK_SIZE. This unfortunately reduces performance, because TASK_SIZE is more expensive to compute compared to LONG_MAX. But correctness first, we can think about optimization later, if required. Reported-by: Closes: https://lore.kernel.org/linux-riscv/77605.1750245028@localhost/ Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Alexandre Ghiti Fixes: ad5643cf2f69 ("riscv: Define TASK_SIZE_MAX for __access_ok()") Link: https://lore.kernel.org/r/20250619155858.1249789-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index a11816bbf9e7..521a995741cd 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1075,7 +1075,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) */ #ifdef CONFIG_64BIT #define TASK_SIZE_64 (PGDIR_SIZE * PTRS_PER_PGD / 2) -#define TASK_SIZE_MAX LONG_MAX #ifdef CONFIG_COMPAT #define TASK_SIZE_32 (_AC(0x80000000, UL) - PAGE_SIZE) From c5136add3f9b4c23b8bbe5f4d722c95d4cfb936e Mon Sep 17 00:00:00 2001 From: Klara Modin Date: Tue, 17 Jun 2025 14:58:47 +0200 Subject: [PATCH 165/297] riscv: export boot_cpu_hartid The mailbox controller driver for the Microchip Inter-processor Communication can be built as a module. It uses cpuid_to_hartid_map and commit 4783ce32b080 ("riscv: export __cpuid_to_hartid_map") enables that to work for SMP. However, cpuid_to_hartid_map uses boot_cpu_hartid on non-SMP kernels and this driver can be useful in such configurations[1]. Export boot_cpu_hartid so the driver can be built as a module on non-SMP kernels as well. Link: https://lore.kernel.org/lkml/20250617-confess-reimburse-876101e099cb@spud/ [1] Cc: stable@vger.kernel.org Fixes: e4b1d67e7141 ("mailbox: add Microchip IPC support") Signed-off-by: Klara Modin Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250617125847.23829-1-klarasmodin@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f7c9a1caa83e..14888e5ea19a 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -50,6 +50,7 @@ atomic_t hart_lottery __section(".sdata") #endif ; unsigned long boot_cpu_hartid; +EXPORT_SYMBOL_GPL(boot_cpu_hartid); /* * Place kernel memory regions on the resource tree so that From d5e3241c5a386a2425823c8c7afb77a465bd040f Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Thu, 19 Jun 2025 11:45:30 +0200 Subject: [PATCH 166/297] ethernet: ionic: Fix DMA mapping tests Change error values of `ionic_tx_map_single()` and `ionic_tx_map_frag()` from 0 to `DMA_MAPPING_ERROR` to prevent collision with 0 as a valid address. This also fixes the use of `dma_mapping_error()` to test against 0 in `ionic_xdp_post_frame()` Fixes: 0f3154e6bcb3 ("ionic: Add Tx and Rx handling") Fixes: 56e41ee12d2d ("ionic: better dma-map error handling") Fixes: ac8813c0ab7d ("ionic: convert Rx queue buffers to use page_pool") Signed-off-by: Thomas Fourier Reviewed-by: Brett Creeley Link: https://patch.msgid.link/20250619094538.283723-2-fourier.thomas@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 2ac59564ded1..d10b58ebf603 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -321,7 +321,7 @@ static int ionic_xdp_post_frame(struct ionic_queue *q, struct xdp_frame *frame, len, DMA_TO_DEVICE); } else /* XDP_REDIRECT */ { dma_addr = ionic_tx_map_single(q, frame->data, len); - if (!dma_addr) + if (dma_addr == DMA_MAPPING_ERROR) return -EIO; } @@ -357,7 +357,7 @@ static int ionic_xdp_post_frame(struct ionic_queue *q, struct xdp_frame *frame, } else { dma_addr = ionic_tx_map_frag(q, frag, 0, skb_frag_size(frag)); - if (dma_mapping_error(q->dev, dma_addr)) { + if (dma_addr == DMA_MAPPING_ERROR) { ionic_tx_desc_unmap_bufs(q, desc_info); return -EIO; } @@ -1083,7 +1083,7 @@ static dma_addr_t ionic_tx_map_single(struct ionic_queue *q, net_warn_ratelimited("%s: DMA single map failed on %s!\n", dev_name(dev), q->name); q_to_tx_stats(q)->dma_map_err++; - return 0; + return DMA_MAPPING_ERROR; } return dma_addr; } @@ -1100,7 +1100,7 @@ static dma_addr_t ionic_tx_map_frag(struct ionic_queue *q, net_warn_ratelimited("%s: DMA frag map failed on %s!\n", dev_name(dev), q->name); q_to_tx_stats(q)->dma_map_err++; - return 0; + return DMA_MAPPING_ERROR; } return dma_addr; } @@ -1116,7 +1116,7 @@ static int ionic_tx_map_skb(struct ionic_queue *q, struct sk_buff *skb, int frag_idx; dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb)); - if (!dma_addr) + if (dma_addr == DMA_MAPPING_ERROR) return -EIO; buf_info->dma_addr = dma_addr; buf_info->len = skb_headlen(skb); @@ -1126,7 +1126,7 @@ static int ionic_tx_map_skb(struct ionic_queue *q, struct sk_buff *skb, nfrags = skb_shinfo(skb)->nr_frags; for (frag_idx = 0; frag_idx < nfrags; frag_idx++, frag++) { dma_addr = ionic_tx_map_frag(q, frag, 0, skb_frag_size(frag)); - if (!dma_addr) + if (dma_addr == DMA_MAPPING_ERROR) goto dma_fail; buf_info->dma_addr = dma_addr; buf_info->len = skb_frag_size(frag); From 7544f3f5b0b58c396f374d060898b5939da31709 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 19 Jun 2025 21:22:28 +0300 Subject: [PATCH 167/297] bridge: mcast: Fix use-after-free during router port configuration The bridge maintains a global list of ports behind which a multicast router resides. The list is consulted during forwarding to ensure multicast packets are forwarded to these ports even if the ports are not member in the matching MDB entry. When per-VLAN multicast snooping is enabled, the per-port multicast context is disabled on each port and the port is removed from the global router port list: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 # ip link add name dummy1 up master br1 type dummy # ip link set dev dummy1 type bridge_slave mcast_router 2 $ bridge -d mdb show | grep router router ports on br1: dummy1 # ip link set dev br1 type bridge mcast_vlan_snooping 1 $ bridge -d mdb show | grep router However, the port can be re-added to the global list even when per-VLAN multicast snooping is enabled: # ip link set dev dummy1 type bridge_slave mcast_router 0 # ip link set dev dummy1 type bridge_slave mcast_router 2 $ bridge -d mdb show | grep router router ports on br1: dummy1 Since commit 4b30ae9adb04 ("net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions"), when per-VLAN multicast snooping is enabled, multicast disablement on a port will disable the per-{port, VLAN} multicast contexts and not the per-port one. As a result, a port will remain in the global router port list even after it is deleted. This will lead to a use-after-free [1] when the list is traversed (when adding a new port to the list, for example): # ip link del dev dummy1 # ip link add name dummy2 up master br1 type dummy # ip link set dev dummy2 type bridge_slave mcast_router 2 Similarly, stale entries can also be found in the per-VLAN router port list. When per-VLAN multicast snooping is disabled, the per-{port, VLAN} contexts are disabled on each port and the port is removed from the per-VLAN router port list: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 # ip link add name dummy1 up master br1 type dummy # bridge vlan add vid 2 dev dummy1 # bridge vlan global set vid 2 dev br1 mcast_snooping 1 # bridge vlan set vid 2 dev dummy1 mcast_router 2 $ bridge vlan global show dev br1 vid 2 | grep router router ports: dummy1 # ip link set dev br1 type bridge mcast_vlan_snooping 0 $ bridge vlan global show dev br1 vid 2 | grep router However, the port can be re-added to the per-VLAN list even when per-VLAN multicast snooping is disabled: # bridge vlan set vid 2 dev dummy1 mcast_router 0 # bridge vlan set vid 2 dev dummy1 mcast_router 2 $ bridge vlan global show dev br1 vid 2 | grep router router ports: dummy1 When the VLAN is deleted from the port, the per-{port, VLAN} multicast context will not be disabled since multicast snooping is not enabled on the VLAN. As a result, the port will remain in the per-VLAN router port list even after it is no longer member in the VLAN. This will lead to a use-after-free [2] when the list is traversed (when adding a new port to the list, for example): # ip link add name dummy2 up master br1 type dummy # bridge vlan add vid 2 dev dummy2 # bridge vlan del vid 2 dev dummy1 # bridge vlan set vid 2 dev dummy2 mcast_router 2 Fix these issues by removing the port from the relevant (global or per-VLAN) router port list in br_multicast_port_ctx_deinit(). The function is invoked during port deletion with the per-port multicast context and during VLAN deletion with the per-{port, VLAN} multicast context. Note that deleting the multicast router timer is not enough as it only takes care of the temporary multicast router states (1 or 3) and not the permanent one (2). [1] BUG: KASAN: slab-out-of-bounds in br_multicast_add_router.part.0+0x3f1/0x560 Write of size 8 at addr ffff888004a67328 by task ip/384 [...] Call Trace: dump_stack_lvl+0x6f/0xa0 print_address_description.constprop.0+0x6f/0x350 print_report+0x108/0x205 kasan_report+0xdf/0x110 br_multicast_add_router.part.0+0x3f1/0x560 br_multicast_set_port_router+0x74e/0xac0 br_setport+0xa55/0x1870 br_port_slave_changelink+0x95/0x120 __rtnl_newlink+0x5e8/0xa40 rtnl_newlink+0x627/0xb00 rtnetlink_rcv_msg+0x6fb/0xb70 netlink_rcv_skb+0x11f/0x350 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x124/0x1c0 do_syscall_64+0xbb/0x360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [2] BUG: KASAN: slab-use-after-free in br_multicast_add_router.part.0+0x378/0x560 Read of size 8 at addr ffff888009f00840 by task bridge/391 [...] Call Trace: dump_stack_lvl+0x6f/0xa0 print_address_description.constprop.0+0x6f/0x350 print_report+0x108/0x205 kasan_report+0xdf/0x110 br_multicast_add_router.part.0+0x378/0x560 br_multicast_set_port_router+0x6f9/0xac0 br_vlan_process_options+0x8b6/0x1430 br_vlan_rtm_process_one+0x605/0xa30 br_vlan_rtm_process+0x396/0x4c0 rtnetlink_rcv_msg+0x2f7/0xb70 netlink_rcv_skb+0x11f/0x350 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x124/0x1c0 do_syscall_64+0xbb/0x360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 2796d846d74a ("net: bridge: vlan: convert mcast router global option to per-vlan entry") Fixes: 4b30ae9adb04 ("net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions") Reported-by: syzbot+7bfa4b72c6a5da128d32@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/684c18bd.a00a0220.279073.000b.GAE@google.com/T/ Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250619182228.1656906-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 0224ef3dfec0..1377f31b719c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2015,10 +2015,19 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { + struct net_bridge *br = pmctx->port->br; + bool del = false; + #if IS_ENABLED(CONFIG_IPV6) timer_delete_sync(&pmctx->ip6_mc_router_timer); #endif timer_delete_sync(&pmctx->ip4_mc_router_timer); + + spin_lock_bh(&br->multicast_lock); + del |= br_ip6_multicast_rport_del(pmctx); + del |= br_ip4_multicast_rport_del(pmctx); + br_multicast_rport_del_notify(pmctx, del); + spin_unlock_bh(&br->multicast_lock); } int br_multicast_add_port(struct net_bridge_port *port) From 2eb7648558a7911f2208e8940cd22ca40e93cc76 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 19 Jun 2025 16:06:02 +0200 Subject: [PATCH 168/297] bpf: Specify access type of bpf_sysctl_get_name args The second argument of bpf_sysctl_get_name() helper is a pointer to a buffer that is being written to. However that isn't specify in the prototype. Until commit 37cce22dbd51a ("bpf: verifier: Refactor helper access type tracking"), all helper accesses were considered as a possible write access by the verifier, so no big harm was done. However, since then, the verifier might make wrong asssumption about the content of that address which might lead it to make faulty optimizations (such as removing code that was wrongly labeled dead). This is what happens in test_sysctl selftest to the tests related to sysctl_get_name. Add MEM_WRITE flag the second argument of bpf_sysctl_get_name(). Signed-off-by: Jerome Marchand Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250619140603.148942-2-jmarchan@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9122c39870bf..f4885514f007 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2134,7 +2134,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; From b8a205486ed5c0c5c0386e472157a81ce686af25 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 19 Jun 2025 16:06:03 +0200 Subject: [PATCH 169/297] selftests/bpf: Convert test_sysctl to prog_tests Convert test_sysctl test to prog_tests with minimal change to the tests themselves. Signed-off-by: Jerome Marchand Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250619140603.148942-3-jmarchan@redhat.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 5 +-- .../bpf/{ => prog_tests}/test_sysctl.c | 37 ++++--------------- 3 files changed, 10 insertions(+), 33 deletions(-) rename tools/testing/selftests/bpf/{ => prog_tests}/test_sysctl.c (98%) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index e2a2c46c008b..3d8378972d26 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -21,7 +21,6 @@ test_lirc_mode2_user flow_dissector_load test_tcpnotify_user test_libbpf -test_sysctl xdping test_cpp *.d diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index cf5ed3bee573..910d8d6402ef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -73,7 +73,7 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \ test_sockmap \ - test_tcpnotify_user test_sysctl \ + test_tcpnotify_user \ test_progs-no_alu32 TEST_INST_SUBDIRS := no_alu32 @@ -220,7 +220,7 @@ ifeq ($(VMLINUX_BTF),) $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") endif -# Define simple and short `make test_progs`, `make test_sysctl`, etc targets +# Define simple and short `make test_progs`, `make test_maps`, etc targets # to build individual tests. # NOTE: Semicolon at the end is critical to override lib.mk's default static # rule for binaries. @@ -329,7 +329,6 @@ NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tag: $(TESTING_HELPERS) $(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS) $(OUTPUT)/xdping: $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c similarity index 98% rename from tools/testing/selftests/bpf/test_sysctl.c rename to tools/testing/selftests/bpf/prog_tests/test_sysctl.c index bcdbd27f22f0..273dd41ca09e 100644 --- a/tools/testing/selftests/bpf/test_sysctl.c +++ b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c @@ -1,22 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include "bpf_util.h" +#include "test_progs.h" #include "cgroup_helpers.h" -#include "testing_helpers.h" #define CG_PATH "/foo" #define MAX_INSNS 512 @@ -1608,26 +1594,19 @@ static int run_tests(int cgfd) return fails ? -1 : 0; } -int main(int argc, char **argv) +void test_sysctl(void) { - int cgfd = -1; - int err = 0; + int cgfd; cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; + if (!ASSERT_OK_FD(cgfd < 0, "create_cgroup")) + goto out; - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + if (!ASSERT_OK(run_tests(cgfd), "run_tests")) + goto out; - if (run_tests(cgfd)) - goto err; - - goto out; -err: - err = -1; out: close(cgfd); cleanup_cgroup_environment(); - return err; + return; } From 0e7facea6da2bd360361440786785752aa5b0e30 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:39:42 +0200 Subject: [PATCH 170/297] wifi: iwlegacy: work around excessive stack usage on clang/kasan In some rare randconfig builds, I seem to trigger a bug in clang where it unrolls a loop but then runs out of registers, which then get spilled to the stack: net/wireless/intel/iwlegacy/4965-rs.c:2262:1: error: stack frame size (1696) exceeds limit (1280) in 'il4965_rs_rate_init' [-Werror,-Wframe-larger-than] This seems to be the same one I saw in the omapdrm driver, and there is an easy workaround by not inlining the il4965_rs_rate_scale_clear_win function. Link: https://github.com/llvm/llvm-project/issues/143908 Signed-off-by: Arnd Bergmann Acked-by: Stanislaw Gruszka Link: https://patch.msgid.link/20250620113946.3987160-1-arnd@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlegacy/4965-rs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlegacy/4965-rs.c b/drivers/net/wireless/intel/iwlegacy/4965-rs.c index 0e5130d1fccd..031d88bf6393 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-rs.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-rs.c @@ -203,7 +203,8 @@ il4965_rs_extract_rate(u32 rate_n_flags) return (u8) (rate_n_flags & 0xFF); } -static void +/* noinline works around https://github.com/llvm/llvm-project/issues/143908 */ +static noinline_for_stack void il4965_rs_rate_scale_clear_win(struct il_rate_scale_data *win) { win->data = 0; From 7a3750ff0f2e8fee338a9c168f429f6c37f0e820 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Sat, 21 Jun 2025 22:32:09 +1000 Subject: [PATCH 171/297] wifi: mac80211: fix beacon interval calculation overflow As we are converting from TU to usecs, a beacon interval of 100*1024 usecs will lead to integer wrapping. To fix change to use a u32. Fixes: 057d5f4ba1e4 ("mac80211: sync dtim_count to TSF") Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250621123209.511796-1-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 27d414efa3fd..a125995ed252 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3884,7 +3884,7 @@ void ieee80211_recalc_dtim(struct ieee80211_local *local, { u64 tsf = drv_get_tsf(local, sdata); u64 dtim_count = 0; - u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; + u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; From 32ca245464e1479bfea8592b9db227fdc1641705 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:55 -0700 Subject: [PATCH 172/297] af_unix: Don't leave consecutive consumed OOB skbs. Jann Horn reported a use-after-free in unix_stream_read_generic(). The following sequences reproduce the issue: $ python3 from socket import * s1, s2 = socketpair(AF_UNIX, SOCK_STREAM) s1.send(b'x', MSG_OOB) s2.recv(1, MSG_OOB) # leave a consumed OOB skb s1.send(b'y', MSG_OOB) s2.recv(1, MSG_OOB) # leave a consumed OOB skb s1.send(b'z', MSG_OOB) s2.recv(1) # recv 'z' illegally s2.recv(1, MSG_OOB) # access 'z' skb (use-after-free) Even though a user reads OOB data, the skb holding the data stays on the recv queue to mark the OOB boundary and break the next recv(). After the last send() in the scenario above, the sk2's recv queue has 2 leading consumed OOB skbs and 1 real OOB skb. Then, the following happens during the next recv() without MSG_OOB 1. unix_stream_read_generic() peeks the first consumed OOB skb 2. manage_oob() returns the next consumed OOB skb 3. unix_stream_read_generic() fetches the next not-yet-consumed OOB skb 4. unix_stream_read_generic() reads and frees the OOB skb , and the last recv(MSG_OOB) triggers KASAN splat. The 3. above occurs because of the SO_PEEK_OFF code, which does not expect unix_skb_len(skb) to be 0, but this is true for such consumed OOB skbs. while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); skb = skb_peek_next(skb, &sk->sk_receive_queue); ... } In addition to this use-after-free, there is another issue that ioctl(SIOCATMARK) does not function properly with consecutive consumed OOB skbs. So, nothing good comes out of such a situation. Instead of complicating manage_oob(), ioctl() handling, and the next ECONNRESET fix by introducing a loop for consecutive consumed OOB skbs, let's not leave such consecutive OOB unnecessarily. Now, while receiving an OOB skb in unix_stream_recv_urg(), if its previous skb is a consumed OOB skb, it is freed. [0]: BUG: KASAN: slab-use-after-free in unix_stream_read_actor (net/unix/af_unix.c:3027) Read of size 4 at addr ffff888106ef2904 by task python3/315 CPU: 2 UID: 0 PID: 315 Comm: python3 Not tainted 6.16.0-rc1-00407-gec315832f6f9 #8 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-4.fc42 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:122) print_report (mm/kasan/report.c:409 mm/kasan/report.c:521) kasan_report (mm/kasan/report.c:636) unix_stream_read_actor (net/unix/af_unix.c:3027) unix_stream_read_generic (net/unix/af_unix.c:2708 net/unix/af_unix.c:2847) unix_stream_recvmsg (net/unix/af_unix.c:3048) sock_recvmsg (net/socket.c:1063 (discriminator 20) net/socket.c:1085 (discriminator 20)) __sys_recvfrom (net/socket.c:2278) __x64_sys_recvfrom (net/socket.c:2291 (discriminator 1) net/socket.c:2287 (discriminator 1) net/socket.c:2287 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f8911fcea06 Code: 5d e8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 75 19 83 e2 39 83 fa 08 75 11 e8 26 ff ff ff 66 0f 1f 44 00 00 48 8b 45 10 0f 05 <48> 8b 5d f8 c9 c3 0f 1f 40 00 f3 0f 1e fa 55 48 89 e5 48 83 ec 08 RSP: 002b:00007fffdb0dccb0 EFLAGS: 00000202 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 00007fffdb0dcdc8 RCX: 00007f8911fcea06 RDX: 0000000000000001 RSI: 00007f8911a5e060 RDI: 0000000000000006 RBP: 00007fffdb0dccd0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000202 R12: 00007f89119a7d20 R13: ffffffffc4653600 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 315: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:60 (discriminator 1) mm/kasan/common.c:69 (discriminator 1)) __kasan_slab_alloc (mm/kasan/common.c:348) kmem_cache_alloc_node_noprof (./include/linux/kasan.h:250 mm/slub.c:4148 mm/slub.c:4197 mm/slub.c:4249) __alloc_skb (net/core/skbuff.c:660 (discriminator 4)) alloc_skb_with_frags (./include/linux/skbuff.h:1336 net/core/skbuff.c:6668) sock_alloc_send_pskb (net/core/sock.c:2993) unix_stream_sendmsg (./include/net/sock.h:1847 net/unix/af_unix.c:2256 net/unix/af_unix.c:2418) __sys_sendto (net/socket.c:712 (discriminator 20) net/socket.c:727 (discriminator 20) net/socket.c:2226 (discriminator 20)) __x64_sys_sendto (net/socket.c:2233 (discriminator 1) net/socket.c:2229 (discriminator 1) net/socket.c:2229 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Freed by task 315: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:60 (discriminator 1) mm/kasan/common.c:69 (discriminator 1)) kasan_save_free_info (mm/kasan/generic.c:579 (discriminator 1)) __kasan_slab_free (mm/kasan/common.c:271) kmem_cache_free (mm/slub.c:4643 (discriminator 3) mm/slub.c:4745 (discriminator 3)) unix_stream_read_generic (net/unix/af_unix.c:3010) unix_stream_recvmsg (net/unix/af_unix.c:3048) sock_recvmsg (net/socket.c:1063 (discriminator 20) net/socket.c:1085 (discriminator 20)) __sys_recvfrom (net/socket.c:2278) __x64_sys_recvfrom (net/socket.c:2291 (discriminator 1) net/socket.c:2287 (discriminator 1) net/socket.c:2287 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) The buggy address belongs to the object at ffff888106ef28c0 which belongs to the cache skbuff_head_cache of size 224 The buggy address is located 68 bytes inside of freed 224-byte region [ffff888106ef28c0, ffff888106ef29a0) The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff888106ef3cc0 pfn:0x106ef2 head: order:1 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x200000000000040(head|node=0|zone=2) page_type: f5(slab) raw: 0200000000000040 ffff8881001d28c0 ffffea000422fe00 0000000000000004 raw: ffff888106ef3cc0 0000000080190010 00000000f5000000 0000000000000000 head: 0200000000000040 ffff8881001d28c0 ffffea000422fe00 0000000000000004 head: ffff888106ef3cc0 0000000080190010 00000000f5000000 0000000000000000 head: 0200000000000001 ffffea00041bbc81 00000000ffffffff 00000000ffffffff head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888106ef2800: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc ffff888106ef2880: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb >ffff888106ef2900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888106ef2980: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ffff888106ef2a00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: Jann Horn Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jann Horn Link: https://patch.msgid.link/20250619041457.1132791-2-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 22e170fb5dda..5392aa53cbc8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2680,11 +2680,11 @@ struct unix_stream_read_state { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { + struct sk_buff *oob_skb, *read_skb = NULL; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; - struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); @@ -2699,9 +2699,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) oob_skb = u->oob_skb; - if (!(state->flags & MSG_PEEK)) + if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); + if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && + !unix_skb_len(oob_skb->prev)) { + read_skb = oob_skb->prev; + __skb_unlink(read_skb, &sk->sk_receive_queue); + } + } + spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); @@ -2712,6 +2719,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) mutex_unlock(&u->iolock); + consume_skb(read_skb); + if (chunk < 0) return -EFAULT; From e1ca44e85f652a6ebd657c67c394894c1fdfb403 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:56 -0700 Subject: [PATCH 173/297] af_unix: Add test for consecutive consumed OOB. Let's add a test case where consecutive concumed OOB skbs stay at the head of the queue. Without the previous patch, ioctl(SIOCATMARK) assertion fails. Before: # RUN msg_oob.no_peek.ex_oob_ex_oob_oob ... # msg_oob.c:305:ex_oob_ex_oob_oob:Expected answ[0] (0) == oob_head (1) # ex_oob_ex_oob_oob: Test terminated by assertion # FAIL msg_oob.no_peek.ex_oob_ex_oob_oob not ok 12 msg_oob.no_peek.ex_oob_ex_oob_oob After: # RUN msg_oob.no_peek.ex_oob_ex_oob_oob ... # OK msg_oob.no_peek.ex_oob_ex_oob_oob ok 12 msg_oob.no_peek.ex_oob_ex_oob_oob Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250619041457.1132791-3-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 3ed3882a93b8..918509a3f040 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -548,6 +548,29 @@ TEST_F(msg_oob, ex_oob_oob) siocatmarkpair(false); } +TEST_F(msg_oob, ex_oob_ex_oob_oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("y", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("z", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); +} + TEST_F(msg_oob, ex_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); From 2a5a4841846b079b5fca5752fe94e59346fbda40 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:57 -0700 Subject: [PATCH 174/297] af_unix: Don't set -ECONNRESET for consumed OOB skb. Christian Brauner reported that even after MSG_OOB data is consumed, calling close() on the receiver socket causes the peer's recv() to return -ECONNRESET: 1. send() and recv() an OOB data. >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX, SOCK_STREAM) >>> s1.send(b'x', MSG_OOB) 1 >>> s2.recv(1, MSG_OOB) b'x' 2. close() for s2 sets ECONNRESET to s1->sk_err even though s2 consumed the OOB data >>> s2.close() >>> s1.recv(10, MSG_DONTWAIT) ... ConnectionResetError: [Errno 104] Connection reset by peer Even after being consumed, the skb holding the OOB 1-byte data stays in the recv queue to mark the OOB boundary and break recv() at that point. This must be considered while close()ing a socket. Let's skip the leading consumed OOB skb while checking the -ECONNRESET condition in unix_release_sock(). Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: Christian Brauner Closes: https://lore.kernel.org/netdev/20250529-sinkt-abfeuern-e7b08200c6b0@brauner/ Signed-off-by: Kuniyuki Iwashima Acked-by: Christian Brauner Link: https://patch.msgid.link/20250619041457.1132791-4-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5392aa53cbc8..52b155123985 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -660,6 +660,11 @@ static void unix_sock_destructor(struct sock *sk) #endif } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); @@ -694,10 +699,16 @@ static void unix_release_sock(struct sock *sk, int embrion) if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb && !unix_skb_len(skb)) + skb = skb_peek_next(skb, &sk->sk_receive_queue); +#endif unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); - if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) + if (skb || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); @@ -2661,11 +2672,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } -static unsigned int unix_skb_len(const struct sk_buff *skb) -{ - return skb->len - UNIXCB(skb).consumed; -} - struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); From 632f55fa60c481035297739ecd374d945c9b32c7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:58 -0700 Subject: [PATCH 175/297] selftest: af_unix: Add tests for -ECONNRESET. A new function resetpair() calls close() for the receiver and checks the return value from recv() on the initial sender side. Now resetpair() is added to each test case and some additional test cases. Note that TCP sets -ECONNRESET to the consumed OOB, but we have decided not to touch TCP MSG_OOB code in the past. Before: # RUN msg_oob.no_peek.ex_oob_ex_oob ... # msg_oob.c:236:ex_oob_ex_oob:AF_UNIX :Connection reset by peer # msg_oob.c:237:ex_oob_ex_oob:Expected: # msg_oob.c:239:ex_oob_ex_oob:Expected ret[0] (-1) == expected_len (0) # ex_oob_ex_oob: Test terminated by assertion # FAIL msg_oob.no_peek.ex_oob_ex_oob not ok 14 msg_oob.no_peek.ex_oob_ex_oob ... # FAILED: 36 / 48 tests passed. # Totals: pass:36 fail:12 xfail:0 xpass:0 skip:0 error:0 After: # RUN msg_oob.no_peek.ex_oob_ex_oob ... # msg_oob.c:244:ex_oob_ex_oob:AF_UNIX : # msg_oob.c:245:ex_oob_ex_oob:TCP :Connection reset by peer # OK msg_oob.no_peek.ex_oob_ex_oob ok 14 msg_oob.no_peek.ex_oob_ex_oob ... # PASSED: 48 / 48 tests passed. # Totals: pass:48 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250619041457.1132791-5-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 119 +++++++++++++++++- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 918509a3f040..b5f474969917 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -210,7 +210,7 @@ static void __sendpair(struct __test_metadata *_metadata, static void __recvpair(struct __test_metadata *_metadata, FIXTURE_DATA(msg_oob) *self, const char *expected_buf, int expected_len, - int buf_len, int flags) + int buf_len, int flags, bool is_sender) { int i, ret[2], recv_errno[2], expected_errno = 0; char recv_buf[2][BUF_SZ] = {}; @@ -221,7 +221,9 @@ static void __recvpair(struct __test_metadata *_metadata, errno = 0; for (i = 0; i < 2; i++) { - ret[i] = recv(self->fd[i * 2 + 1], recv_buf[i], buf_len, flags); + int index = is_sender ? i * 2 : i * 2 + 1; + + ret[i] = recv(self->fd[index], recv_buf[i], buf_len, flags); recv_errno[i] = errno; } @@ -308,6 +310,20 @@ static void __siocatmarkpair(struct __test_metadata *_metadata, ASSERT_EQ(answ[0], answ[1]); } +static void __resetpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + const FIXTURE_VARIANT(msg_oob) *variant, + bool reset) +{ + int i; + + for (i = 0; i < 2; i++) + close(self->fd[i * 2 + 1]); + + __recvpair(_metadata, self, "", reset ? -ECONNRESET : 0, 1, + variant->peek ? MSG_PEEK : 0, true); +} + #define sendpair(buf, len, flags) \ __sendpair(_metadata, self, buf, len, flags) @@ -316,9 +332,10 @@ static void __siocatmarkpair(struct __test_metadata *_metadata, if (variant->peek) \ __recvpair(_metadata, self, \ expected_buf, expected_len, \ - buf_len, (flags) | MSG_PEEK); \ + buf_len, (flags) | MSG_PEEK, false); \ __recvpair(_metadata, self, \ - expected_buf, expected_len, buf_len, flags); \ + expected_buf, expected_len, \ + buf_len, flags, false); \ } while (0) #define epollpair(oob_remaining) \ @@ -330,6 +347,9 @@ static void __siocatmarkpair(struct __test_metadata *_metadata, #define setinlinepair() \ __setinlinepair(_metadata, self) +#define resetpair(reset) \ + __resetpair(_metadata, self, variant, reset) + #define tcp_incompliant \ for (self->tcp_compliant = false; \ self->tcp_compliant == false; \ @@ -344,6 +364,21 @@ TEST_F(msg_oob, non_oob) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(true); +} + +TEST_F(msg_oob, non_oob_no_reset) +{ + sendpair("x", 1, 0); + epollpair(false); + siocatmarkpair(false); + + recvpair("x", 1, 1, 0); + epollpair(false); + siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob) @@ -355,6 +390,19 @@ TEST_F(msg_oob, oob) recvpair("x", 1, 1, MSG_OOB); epollpair(false); siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } +} + +TEST_F(msg_oob, oob_reset) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + resetpair(true); } TEST_F(msg_oob, oob_drop) @@ -370,6 +418,8 @@ TEST_F(msg_oob, oob_drop) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob_ahead) @@ -385,6 +435,10 @@ TEST_F(msg_oob, oob_ahead) recvpair("hell", 4, 4, 0); epollpair(false); siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } } TEST_F(msg_oob, oob_break) @@ -403,6 +457,8 @@ TEST_F(msg_oob, oob_break) recvpair("", -EAGAIN, 1, 0); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob_ahead_break) @@ -426,6 +482,8 @@ TEST_F(msg_oob, oob_ahead_break) recvpair("world", 5, 5, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob_break_drop) @@ -449,6 +507,8 @@ TEST_F(msg_oob, oob_break_drop) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, ex_oob_break) @@ -476,6 +536,8 @@ TEST_F(msg_oob, ex_oob_break) recvpair("ld", 2, 2, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, ex_oob_drop) @@ -498,6 +560,8 @@ TEST_F(msg_oob, ex_oob_drop) epollpair(false); siocatmarkpair(true); } + + resetpair(false); } TEST_F(msg_oob, ex_oob_drop_2) @@ -523,6 +587,8 @@ TEST_F(msg_oob, ex_oob_drop_2) epollpair(false); siocatmarkpair(true); } + + resetpair(false); } TEST_F(msg_oob, ex_oob_oob) @@ -546,6 +612,31 @@ TEST_F(msg_oob, ex_oob_oob) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(false); +} + +TEST_F(msg_oob, ex_oob_ex_oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("y", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } } TEST_F(msg_oob, ex_oob_ex_oob_oob) @@ -599,6 +690,10 @@ TEST_F(msg_oob, ex_oob_ahead_break) recvpair("d", 1, 1, MSG_OOB); epollpair(false); siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } } TEST_F(msg_oob, ex_oob_siocatmark) @@ -618,6 +713,8 @@ TEST_F(msg_oob, ex_oob_siocatmark) recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ epollpair(true); siocatmarkpair(false); + + resetpair(true); } TEST_F(msg_oob, inline_oob) @@ -635,6 +732,8 @@ TEST_F(msg_oob, inline_oob) recvpair("x", 1, 1, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_oob_break) @@ -656,6 +755,8 @@ TEST_F(msg_oob, inline_oob_break) recvpair("o", 1, 1, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_oob_ahead_break) @@ -684,6 +785,8 @@ TEST_F(msg_oob, inline_oob_ahead_break) epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_break) @@ -709,6 +812,8 @@ TEST_F(msg_oob, inline_ex_oob_break) recvpair("rld", 3, 3, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_no_drop) @@ -730,6 +835,8 @@ TEST_F(msg_oob, inline_ex_oob_no_drop) recvpair("y", 1, 1, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_drop) @@ -754,6 +861,8 @@ TEST_F(msg_oob, inline_ex_oob_drop) epollpair(false); siocatmarkpair(false); } + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_siocatmark) @@ -775,6 +884,8 @@ TEST_F(msg_oob, inline_ex_oob_siocatmark) recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ epollpair(true); siocatmarkpair(false); + + resetpair(true); } TEST_HARNESS_MAIN From b272f42547d85356b035e46273ddaf2aa4e161b8 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 23 Jun 2025 07:26:27 -0700 Subject: [PATCH 176/297] ALSA: qc_audio_offload: Fix missing error code in prepare_qmi_response() When snd_soc_usb_find_priv_data() fails, return failure instead of success. While we are at it also use direct returns at first few error paths where there is no additional cleanup needed. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/Z_40qL4JnyjR4j0O@stanley.mountain/ Fixes: 326bbc348298 ("ALSA: usb-audio: qcom: Introduce QC USB SND offloading support") Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20250623142639.2938056-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Takashi Iwai --- sound/usb/qcom/qc_audio_offload.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c index 5bc27c82e0af..797afd4561bd 100644 --- a/sound/usb/qcom/qc_audio_offload.c +++ b/sound/usb/qcom/qc_audio_offload.c @@ -1360,20 +1360,21 @@ static int prepare_qmi_response(struct snd_usb_substream *subs, if (!uadev[card_num].ctrl_intf) { dev_err(&subs->dev->dev, "audio ctrl intf info not cached\n"); - ret = -ENODEV; - goto err; + return -ENODEV; } ret = uaudio_populate_uac_desc(subs, resp); if (ret < 0) - goto err; + return ret; resp->slot_id = subs->dev->slot_id; resp->slot_id_valid = 1; data = snd_soc_usb_find_priv_data(uaudio_qdev->auxdev->dev.parent); - if (!data) - goto err; + if (!data) { + dev_err(&subs->dev->dev, "No private data found\n"); + return -ENODEV; + } uaudio_qdev->data = data; @@ -1382,7 +1383,7 @@ static int prepare_qmi_response(struct snd_usb_substream *subs, &resp->xhci_mem_info.tr_data, &resp->std_as_data_ep_desc); if (ret < 0) - goto err; + return ret; resp->std_as_data_ep_desc_valid = 1; @@ -1500,7 +1501,6 @@ static int prepare_qmi_response(struct snd_usb_substream *subs, xhci_sideband_remove_endpoint(uadev[card_num].sb, usb_pipe_endpoint(subs->dev, subs->data_endpoint->pipe)); -err: return ret; } From 93598167dcb6351ba40449d994244696168f1094 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Tue, 24 Jun 2025 10:14:27 +0300 Subject: [PATCH 177/297] wifi: iwlwifi: mvm: assume '1' as the default mac_config_cmd version Unfortunately, FWs of some devices don't have the version of the iwl_mac_config_cmd defined in the TLVs. We send 0 as the 'def argument to iwl_fw_lookup_cmd_ver, so for such FWs, the return value will be 0, leading to a warning, and to not sending the command. Fix this by assuming that the default version is 1. Fixes: 83f3ac2848b4 ("wifi: iwlwifi: Fix incorrect logic on cmd_ver range checking") Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250624071427.2662621-1-miriam.rachel.korenblit@intel.com --- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c index 3c255ae916c8..3f8b840871d3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c @@ -32,9 +32,9 @@ static void iwl_mvm_mld_mac_ctxt_cmd_common(struct iwl_mvm *mvm, unsigned int link_id; int cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, WIDE_ID(MAC_CONF_GROUP, - MAC_CONFIG_CMD), 0); + MAC_CONFIG_CMD), 1); - if (WARN_ON(cmd_ver < 1 || cmd_ver > 3)) + if (WARN_ON(cmd_ver > 3)) return; cmd->id_and_color = cpu_to_le32(mvmvif->id); From ff21a6ec0f27c126db0a86d96751bd6e5d1d9874 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Tue, 24 Jun 2025 02:59:28 +0000 Subject: [PATCH 178/297] ASoC: rt721-sdca: fix boost gain calculation error Fix the boost gain calculation error in rt721_sdca_set_gain_get. This patch is specific for "FU33 Boost Volume". Signed-off-by: Jack Yu Link: https://patch.msgid.link/1b18fcde41c64d6fa85451d523c0434a@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt721-sdca.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/rt721-sdca.c b/sound/soc/codecs/rt721-sdca.c index 1c9f32e405cf..ba080957e933 100644 --- a/sound/soc/codecs/rt721-sdca.c +++ b/sound/soc/codecs/rt721-sdca.c @@ -430,6 +430,7 @@ static int rt721_sdca_set_gain_get(struct snd_kcontrol *kcontrol, unsigned int read_l, read_r, ctl_l = 0, ctl_r = 0; unsigned int adc_vol_flag = 0; const unsigned int interval_offset = 0xc0; + const unsigned int tendA = 0x200; const unsigned int tendB = 0xa00; if (strstr(ucontrol->id.name, "FU1E Capture Volume") || @@ -439,9 +440,16 @@ static int rt721_sdca_set_gain_get(struct snd_kcontrol *kcontrol, regmap_read(rt721->mbq_regmap, mc->reg, &read_l); regmap_read(rt721->mbq_regmap, mc->rreg, &read_r); - if (mc->shift == 8) /* boost gain */ + if (mc->shift == 8) { + /* boost gain */ ctl_l = read_l / tendB; - else { + } else if (mc->shift == 1) { + /* FU33 boost gain */ + if (read_l == 0x8000 || read_l == 0xfe00) + ctl_l = 0; + else + ctl_l = read_l / tendA + 1; + } else { if (adc_vol_flag) ctl_l = mc->max - (((0x1e00 - read_l) & 0xffff) / interval_offset); else @@ -449,9 +457,16 @@ static int rt721_sdca_set_gain_get(struct snd_kcontrol *kcontrol, } if (read_l != read_r) { - if (mc->shift == 8) /* boost gain */ + if (mc->shift == 8) { + /* boost gain */ ctl_r = read_r / tendB; - else { /* ADC/DAC gain */ + } else if (mc->shift == 1) { + /* FU33 boost gain */ + if (read_r == 0x8000 || read_r == 0xfe00) + ctl_r = 0; + else + ctl_r = read_r / tendA + 1; + } else { /* ADC/DAC gain */ if (adc_vol_flag) ctl_r = mc->max - (((0x1e00 - read_r) & 0xffff) / interval_offset); else From d87c3ca0f8f1ca4c25f2ed819e954952f4d8d709 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 24 Jun 2025 13:07:49 +0200 Subject: [PATCH 179/297] wifi: mac80211: finish link init before RCU publish Since the link/conf pointers can be accessed without any protection other than RCU, make sure the data is actually set up before publishing the structures. Fixes: b2e8434f1829 ("wifi: mac80211: set up/tear down client vif links properly") Link: https://patch.msgid.link/20250624130749.9a308b713c74.I4a80f5eead112a38730939ea591d2e275c721256@changeid Signed-off-by: Johannes Berg --- net/mac80211/link.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mac80211/link.c b/net/mac80211/link.c index d40c2bd3b50b..4f7b7d0f64f2 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -93,9 +93,6 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, if (link_id < 0) link_id = 0; - rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); - rcu_assign_pointer(sdata->link[link_id], link); - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { struct ieee80211_sub_if_data *ap_bss; struct ieee80211_bss_conf *ap_bss_conf; @@ -145,6 +142,9 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_link_debugfs_add(link); } + + rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); + rcu_assign_pointer(sdata->link[link_id], link); } void ieee80211_link_stop(struct ieee80211_link_data *link) From 0748e553df0225754c316a92af3a77fdc057b358 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 24 Jun 2025 10:25:04 -0400 Subject: [PATCH 180/297] userns and mnt_idmap leak in open_tree_attr(2) Once want_mount_setattr() has returned a positive, it does require finish_mount_kattr() to release ->mnt_userns. Failing do_mount_setattr() does not change that. As the result, we can end up leaking userns and possibly mnt_idmap as well. Fixes: c4a16820d901 ("fs: add open_tree_attr()") Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index eed83254492f..54c59e091919 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5307,16 +5307,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); - if (ret < 0) - return ret; - - if (ret) { + if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); - if (ret) - return ret; - finish_mount_kattr(&kattr); } + if (ret) + return ret; } fd = get_unused_fd_flags(flags & O_CLOEXEC); From f23c52aafb1675ab1d1f46914556d8e29cbbf7b3 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 19 Jun 2025 08:46:17 -0300 Subject: [PATCH 181/297] serial: imx: Restore original RXTL for console to fix data loss Commit 7a637784d517 ("serial: imx: reduce RX interrupt frequency") introduced a regression on the i.MX6UL EVK board. The issue can be reproduced with the following steps: - Open vi on the board. - Paste a text file (~150 characters). - Save the file, then repeat the process. - Compare the sha256sum of the saved files. The checksums do not match due to missing characters or entire lines. Fix this by restoring the RXTL value to 1 when the UART is used as a console. This ensures timely RX interrupts and reliable data reception in console mode. With this change, pasted content is saved correctly, and checksums are always consistent. Cc: stable Fixes: 7a637784d517 ("serial: imx: reduce RX interrupt frequency") Signed-off-by: Fabio Estevam Reviewed-by: Stefan Wahren Link: https://lore.kernel.org/r/20250619114617.2791939-1-festevam@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index bd02ee898f5d..500dfc009d03 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -235,6 +235,7 @@ struct imx_port { enum imx_tx_state tx_state; struct hrtimer trigger_start_tx; struct hrtimer trigger_stop_tx; + unsigned int rxtl; }; struct imx_port_ucrs { @@ -1339,6 +1340,7 @@ static void imx_uart_clear_rx_errors(struct imx_port *sport) #define TXTL_DEFAULT 8 #define RXTL_DEFAULT 8 /* 8 characters or aging timer */ +#define RXTL_CONSOLE_DEFAULT 1 #define TXTL_DMA 8 /* DMA burst setting */ #define RXTL_DMA 9 /* DMA burst setting */ @@ -1457,7 +1459,7 @@ static void imx_uart_disable_dma(struct imx_port *sport) ucr1 &= ~(UCR1_RXDMAEN | UCR1_TXDMAEN | UCR1_ATDMAEN); imx_uart_writel(sport, ucr1, UCR1); - imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); + imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl); sport->dma_is_enabled = 0; } @@ -1482,7 +1484,12 @@ static int imx_uart_startup(struct uart_port *port) return retval; } - imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); + if (uart_console(&sport->port)) + sport->rxtl = RXTL_CONSOLE_DEFAULT; + else + sport->rxtl = RXTL_DEFAULT; + + imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl); /* disable the DREN bit (Data Ready interrupt enable) before * requesting IRQs @@ -1948,7 +1955,7 @@ static int imx_uart_poll_init(struct uart_port *port) if (retval) clk_disable_unprepare(sport->clk_ipg); - imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); + imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl); uart_port_lock_irqsave(&sport->port, &flags); @@ -2040,7 +2047,7 @@ static int imx_uart_rs485_config(struct uart_port *port, struct ktermios *termio /* If the receiver trigger is 0, set it to a default value */ ufcr = imx_uart_readl(sport, UFCR); if ((ufcr & UFCR_RXTL_MASK) == 0) - imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); + imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl); imx_uart_start_rx(port); } @@ -2302,7 +2309,7 @@ imx_uart_console_setup(struct console *co, char *options) else imx_uart_console_get_options(sport, &baud, &parity, &bits); - imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); + imx_uart_setup_ufcr(sport, TXTL_DEFAULT, sport->rxtl); retval = uart_set_options(&sport->port, co, baud, parity, bits, flow); From 09812134071b3941fb81def30b61ed36d3a5dfb5 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Mon, 23 Jun 2025 09:34:45 +0000 Subject: [PATCH 182/297] dt-bindings: serial: 8250: Make clocks and clock-frequency exclusive The 8250 binding before converting to json-schema states, - clock-frequency : the input clock frequency for the UART or - clocks phandle to refer to the clk used as per Documentation/devicetree for clock-related properties, where "or" indicates these properties shouldn't exist at the same time. Additionally, the behavior of Linux's driver is strange when both clocks and clock-frequency are specified: it ignores clocks and obtains the frequency from clock-frequency, left the specified clocks unclaimed. It may even be disabled, which is undesired most of the time. But "anyOf" doesn't prevent these two properties from coexisting, as it considers the object valid as long as there's at LEAST one match. Let's switch to "oneOf" and disallows the other property if one exists, precisely matching the original binding and avoiding future confusion on the driver's behavior. Fixes: e69f5dc623f9 ("dt-bindings: serial: Convert 8250 to json-schema") Cc: stable Signed-off-by: Yao Zi Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20250623093445.62327-1-ziyao@disroot.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/serial/8250.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index 33d2016b6509..c6bc27709bf7 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -45,7 +45,7 @@ allOf: - ns16550 - ns16550a then: - anyOf: + oneOf: - required: [ clock-frequency ] - required: [ clocks ] From 0043ec26d827ddb1e85bd9786693152aa6f55d16 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 12 Jun 2025 20:11:14 +0530 Subject: [PATCH 183/297] drm/amdgpu/gfx9: Add Cleaner Shader Support for GFX9.x GPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable the cleaner shader for other GFX9.x series of GPUs to provide data isolation between GPU workloads. The cleaner shader is responsible for clearing the Local Data Store (LDS), Vector General Purpose Registers (VGPRs), and Scalar General Purpose Registers (SGPRs), which helps prevent data leakage and ensures accurate computation results. This update extends cleaner shader support to GFX9.x GPUs, previously available for GFX9.4.2. It enhances security by clearing GPU memory between processes and maintains a consistent GPU state across KGD and KFD workloads. Cc: Manu Rastogi Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 99808926d0ea6234a89e35240a7cb088368de9e1) --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index d377a7c57d5e..ad9be3656653 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2235,6 +2235,25 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) } switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(9, 0, 1): + case IP_VERSION(9, 2, 1): + case IP_VERSION(9, 4, 0): + case IP_VERSION(9, 2, 2): + case IP_VERSION(9, 1, 0): + case IP_VERSION(9, 3, 0): + adev->gfx.cleaner_shader_ptr = gfx_9_4_2_cleaner_shader_hex; + adev->gfx.cleaner_shader_size = sizeof(gfx_9_4_2_cleaner_shader_hex); + if (adev->gfx.me_fw_version >= 167 && + adev->gfx.pfp_fw_version >= 196 && + adev->gfx.mec_fw_version >= 474) { + adev->gfx.enable_cleaner_shader = true; + r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size); + if (r) { + adev->gfx.enable_cleaner_shader = false; + dev_err(adev->dev, "Failed to initialize cleaner shader\n"); + } + } + break; case IP_VERSION(9, 4, 2): adev->gfx.cleaner_shader_ptr = gfx_9_4_2_cleaner_shader_hex; adev->gfx.cleaner_shader_size = sizeof(gfx_9_4_2_cleaner_shader_hex); From 99579c55c3d6132a5236926652c0a72a526b809d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 20 May 2025 10:02:14 -0400 Subject: [PATCH 184/297] drm/amdgpu/mes: add compatibility checks for set_hw_resource_1 Seems some older MES firmware versions do not properly support this packet. Add back some the compatibility checks. v2: switch to fw version check (Shaoyun) Fixes: f81cd793119e ("drm/amd/amdgpu: Fix MES init sequence") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4295 Cc: Shaoyun Liu Reviewed-by: shaoyun.liu Signed-off-by: Alex Deucher (cherry picked from commit 0180e0a5dd5c6ff118043ee42dbbbddaf881f283) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 10 ++++++---- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index c9eba537de09..28eb846280dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1630,10 +1630,12 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) goto failure; - r = mes_v11_0_set_hw_resources_1(&adev->mes); - if (r) { - DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r); - goto failure; + if ((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x50) { + r = mes_v11_0_set_hw_resources_1(&adev->mes); + if (r) { + DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r); + goto failure; + } } r = mes_v11_0_query_sched_status(&adev->mes); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index b4f17332d466..6b222630f3fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1742,7 +1742,8 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) goto failure; - mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_SCHED_PIPE); + if ((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x4b) + mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_SCHED_PIPE); mes_v12_0_init_aggregated_doorbell(&adev->mes); From 73eab78721f7b85216f1ca8c7b732f13213b5b32 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 17 Jun 2025 13:30:52 -0500 Subject: [PATCH 185/297] drm/amd: Adjust output for discovery error handling commit 017fbb6690c2 ("drm/amdgpu/discovery: check ip_discovery fw file available") added support for reading an amdgpu IP discovery bin file for some specific products. If it's not found then it will fallback to hardcoded values. However if it's not found there is also a lot of noise about missing files and errors. Adjust the error handling to decrease most messages to DEBUG and to show users less about missing files. Reviewed-by: Lijo Lazar Reported-by: Marcus Seyfarth Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4312 Tested-by: Marcus Seyfarth Fixes: 017fbb6690c2 ("drm/amdgpu/discovery: check ip_discovery fw file available") Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20250617183052.1692059-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 49f1f9f6c3c9febf8ba93f94a8d9c8d03e1ea0a1) --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index a0e9bf9b2710..81b3443c8d7f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -321,10 +321,12 @@ static int amdgpu_discovery_read_binary_from_file(struct amdgpu_device *adev, const struct firmware *fw; int r; - r = request_firmware(&fw, fw_name, adev->dev); + r = firmware_request_nowarn(&fw, fw_name, adev->dev); if (r) { - dev_err(adev->dev, "can't load firmware \"%s\"\n", - fw_name); + if (amdgpu_discovery == 2) + dev_err(adev->dev, "can't load firmware \"%s\"\n", fw_name); + else + drm_info(&adev->ddev, "Optional firmware \"%s\" was not found\n", fw_name); return r; } @@ -459,16 +461,12 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev) /* Read from file if it is the preferred option */ fw_name = amdgpu_discovery_get_fw_name(adev); if (fw_name != NULL) { - dev_info(adev->dev, "use ip discovery information from file"); + drm_dbg(&adev->ddev, "use ip discovery information from file"); r = amdgpu_discovery_read_binary_from_file(adev, adev->mman.discovery_bin, fw_name); - - if (r) { - dev_err(adev->dev, "failed to read ip discovery binary from file\n"); - r = -EINVAL; + if (r) goto out; - } - } else { + drm_dbg(&adev->ddev, "use ip discovery information from memory"); r = amdgpu_discovery_read_binary_from_mem( adev, adev->mman.discovery_bin); if (r) @@ -1338,10 +1336,8 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev) int r; r = amdgpu_discovery_init(adev); - if (r) { - DRM_ERROR("amdgpu_discovery_init failed\n"); + if (r) return r; - } wafl_ver = 0; adev->gfx.xcc_mask = 0; @@ -2579,8 +2575,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev) break; default: r = amdgpu_discovery_reg_base_init(adev); - if (r) - return -EINVAL; + if (r) { + drm_err(&adev->ddev, "discovery failed: %d\n", r); + return r; + } amdgpu_discovery_harvest_ip(adev); amdgpu_discovery_get_gfx_info(adev); From 899dec4e885f839da2065803e21b7ab003a5c609 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 19 Jun 2025 17:56:29 -0400 Subject: [PATCH 186/297] drm/amdgpu/sdma6: add ucode version checks for userq support SDMA 6.0.0 version 24 SDMA 6.0.2 version 21 SDMA 6.0.3 version 25 Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit e8cca30d8b34f1c4101c237914c53068d4a55e73) --- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 5a70ae17be04..a9bdf8d61d6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1374,9 +1374,22 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); - /* add firmware version checks here */ - if (0 && !adev->sdma.disable_uq) - adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) { + case IP_VERSION(6, 0, 0): + if ((adev->sdma.instance[0].fw_version >= 24) && !adev->sdma.disable_uq) + adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; + break; + case IP_VERSION(6, 0, 2): + if ((adev->sdma.instance[0].fw_version >= 21) && !adev->sdma.disable_uq) + adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; + break; + case IP_VERSION(6, 0, 3): + if ((adev->sdma.instance[0].fw_version >= 25) && !adev->sdma.disable_uq) + adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; + break; + default: + break; + } r = amdgpu_sdma_sysfs_reset_mask_init(adev); if (r) From 31135cc99c40247bec924dcdcd74a58e866c52d8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Jun 2025 11:39:22 -0400 Subject: [PATCH 187/297] drm/amdgpu/sdma7: add ucode version checks for userq support SDMA 7.0.0/1: 7836028 Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit 8c011408ed842dfccdd50a90a9cf6bccdb85cc0e) --- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index ad47d0bdf777..86903eccbd4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1349,9 +1349,15 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); - /* add firmware version checks here */ - if (0 && !adev->sdma.disable_uq) - adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) { + case IP_VERSION(7, 0, 0): + case IP_VERSION(7, 0, 1): + if ((adev->sdma.instance[0].fw_version >= 7836028) && !adev->sdma.disable_uq) + adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; + break; + default: + break; + } return r; } From 66abb996999de0d440a02583a6e70c2c24deab45 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 23 Jun 2025 12:11:13 -0500 Subject: [PATCH 188/297] drm/amd/display: Fix AMDGPU_MAX_BL_LEVEL value [Why] commit 16dc8bc27c2a ("drm/amd/display: Export full brightness range to userspace") adjusted the brightness range to scale to larger values, but missed updating AMDGPU_MAX_BL_LEVEL which is needed to make sure that scaling works properly with custom brightness curves. [How] As the change for max brightness of 0xFFFF only applies to devices supporting DC, use existing DC define MAX_BACKLIGHT_LEVEL. Fixes: 16dc8bc27c2a ("drm/amd/display: Export full brightness range to userspace") Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20250623171114.1156451-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 5b852044eb0d3e1f1c946d32e05fcb068e0a20a0) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index bc4cd11bfc79..0b8ac9edc070 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4718,16 +4718,16 @@ static int get_brightness_range(const struct amdgpu_dm_backlight_caps *caps, return 1; } -/* Rescale from [min..max] to [0..AMDGPU_MAX_BL_LEVEL] */ +/* Rescale from [min..max] to [0..MAX_BACKLIGHT_LEVEL] */ static inline u32 scale_input_to_fw(int min, int max, u64 input) { - return DIV_ROUND_CLOSEST_ULL(input * AMDGPU_MAX_BL_LEVEL, max - min); + return DIV_ROUND_CLOSEST_ULL(input * MAX_BACKLIGHT_LEVEL, max - min); } -/* Rescale from [0..AMDGPU_MAX_BL_LEVEL] to [min..max] */ +/* Rescale from [0..MAX_BACKLIGHT_LEVEL] to [min..max] */ static inline u32 scale_fw_to_input(int min, int max, u64 input) { - return min + DIV_ROUND_CLOSEST_ULL(input * (max - min), AMDGPU_MAX_BL_LEVEL); + return min + DIV_ROUND_CLOSEST_ULL(input * (max - min), MAX_BACKLIGHT_LEVEL); } static void convert_custom_brightness(const struct amdgpu_dm_backlight_caps *caps, @@ -4947,7 +4947,7 @@ amdgpu_dm_register_backlight_device(struct amdgpu_dm_connector *aconnector) drm_dbg(drm, "Backlight caps: min: %d, max: %d, ac %d, dc %d\n", min, max, caps->ac_level, caps->dc_level); } else - props.brightness = props.max_brightness = AMDGPU_MAX_BL_LEVEL; + props.brightness = props.max_brightness = MAX_BACKLIGHT_LEVEL; if (caps->data_points && !(amdgpu_dc_debug_mask & DC_DISABLE_CUSTOM_BRIGHTNESS_CURVE)) drm_info(drm, "Using custom brightness curve\n"); From 6847b3b6e84ef37451c074e6a8db3fbd250c8dbf Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 16 Jun 2025 18:08:41 +0200 Subject: [PATCH 189/297] drm/amd/display: Add sanity checks for drm_edid_raw() When EDID is retrieved via drm_edid_raw(), it doesn't guarantee to return proper EDID bytes the caller wants: it may be either NULL (that leads to an Oops) or with too long bytes over the fixed size raw_edid array (that may lead to memory corruption). The latter was reported actually when connected with a bad adapter. Add sanity checks for drm_edid_raw() to address the above corner cases, and return EDID_BAD_INPUT accordingly. Fixes: 48edb2a4256e ("drm/amd/display: switch amdgpu_dm_connector to use struct drm_edid") Link: https://bugzilla.suse.com/show_bug.cgi?id=1236415 Signed-off-by: Takashi Iwai Signed-off-by: Alex Deucher (cherry picked from commit 648d3f4d209725d51900d6a3ed46b7b600140cdf) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index d4395b92fb85..9e3e51a2dc49 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -1029,6 +1029,10 @@ enum dc_edid_status dm_helpers_read_local_edid( return EDID_NO_RESPONSE; edid = drm_edid_raw(drm_edid); // FIXME: Get rid of drm_edid_raw() + if (!edid || + edid->extensions >= sizeof(sink->dc_edid.raw_edid) / EDID_LENGTH) + return EDID_BAD_INPUT; + sink->dc_edid.length = EDID_LENGTH * (edid->extensions + 1); memmove(sink->dc_edid.raw_edid, (uint8_t *)edid, sink->dc_edid.length); From 6c038b58a2dc5a008c7e7a1297f5aaa4deaaaa7e Mon Sep 17 00:00:00 2001 From: Tamura Dai Date: Mon, 16 Jun 2025 08:55:48 +0900 Subject: [PATCH 190/297] ASoC: SOF: Intel: hda: Use devm_kstrdup() to avoid memleak. sof_pdata->tplg_filename can have address allocated by kstrdup() and can be overwritten. Memory leak was detected with kmemleak: unreferenced object 0xffff88812391ff60 (size 16): comm "kworker/4:1", pid 161, jiffies 4294802931 hex dump (first 16 bytes): 73 6f 66 2d 68 64 61 2d 67 65 6e 65 72 69 63 00 sof-hda-generic. backtrace (crc 4bf1675c): __kmalloc_node_track_caller_noprof+0x49c/0x6b0 kstrdup+0x46/0xc0 hda_machine_select.cold+0x1de/0x12cf [snd_sof_intel_hda_generic] sof_init_environment+0x16f/0xb50 [snd_sof] sof_probe_continue+0x45/0x7c0 [snd_sof] sof_probe_work+0x1e/0x40 [snd_sof] process_one_work+0x894/0x14b0 worker_thread+0x5e5/0xfb0 kthread+0x39d/0x760 ret_from_fork+0x31/0x70 ret_from_fork_asm+0x1a/0x30 Signed-off-by: Tamura Dai Link: https://patch.msgid.link/20250615235548.8591-1-kirinode0@gmail.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index bdfe388da198..3b47191ea7a5 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -1257,11 +1257,11 @@ static int check_tplg_quirk_mask(struct snd_soc_acpi_mach *mach) return 0; } -static char *remove_file_ext(const char *tplg_filename) +static char *remove_file_ext(struct device *dev, const char *tplg_filename) { char *filename, *tmp; - filename = kstrdup(tplg_filename, GFP_KERNEL); + filename = devm_kstrdup(dev, tplg_filename, GFP_KERNEL); if (!filename) return NULL; @@ -1345,7 +1345,7 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev) */ if (!sof_pdata->tplg_filename) { /* remove file extension if it exists */ - tplg_filename = remove_file_ext(mach->sof_tplg_filename); + tplg_filename = remove_file_ext(sdev->dev, mach->sof_tplg_filename); if (!tplg_filename) return NULL; From b07f349d1864abe29436f45e3047da2bdd476462 Mon Sep 17 00:00:00 2001 From: Khairul Anuar Romli Date: Mon, 16 Jun 2025 09:13:53 +0800 Subject: [PATCH 191/297] spi: spi-cadence-quadspi: Fix pm runtime unbalance Having PM put sync in remove function is causing PM underflow during remove operation. This is caused by the function, runtime_pm_get_sync, not being called anywhere during the op. Ensure that calls to pm_runtime_enable()/pm_runtime_disable() and pm_runtime_get_sync()/pm_runtime_put_sync() match. echo 108d2000.spi > /sys/bus/platform/drivers/cadence-qspi/unbind [ 49.644256] Deleting MTD partitions on "108d2000.spi.0": [ 49.649575] Deleting u-boot MTD partition [ 49.684087] Deleting root MTD partition [ 49.724188] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! Continuous bind/unbind will result in an "Unbalanced pm_runtime_enable" error. Subsequent unbind attempts will return a "No such device" error, while bind attempts will return a "Resource temporarily unavailable" error. [ 47.592434] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 49.592233] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128) [ 53.232309] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 55.828550] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128) [ 57.940627] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 59.912490] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128) [ 61.876243] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 61.883000] platform 108d2000.spi: Unbalanced pm_runtime_enable! [ 532.012270] cadence-qspi 108d2000.spi: probe with driver cadence-qspi failed1 Also, change clk_disable_unprepare() to clk_disable() since continuous bind and unbind operations will trigger a warning indicating that the clock is already unprepared. Fixes: 4892b374c9b7 ("mtd: spi-nor: cadence-quadspi: Add runtime PM support") cc: stable@vger.kernel.org # 6.6+ Signed-off-by: Khairul Anuar Romli Reviewed-by: Matthew Gerlach Link: https://patch.msgid.link/4e7a4b8aba300e629b45a04f90bddf665fbdb335.1749601877.git.khairul.anuar.romli@altera.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index fe0f122f07b0..aa1932ba17cb 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -1958,10 +1958,10 @@ static int cqspi_probe(struct platform_device *pdev) goto probe_setup_failed; } - ret = devm_pm_runtime_enable(dev); - if (ret) { - if (cqspi->rx_chan) - dma_release_channel(cqspi->rx_chan); + pm_runtime_enable(dev); + + if (cqspi->rx_chan) { + dma_release_channel(cqspi->rx_chan); goto probe_setup_failed; } @@ -1981,6 +1981,7 @@ static int cqspi_probe(struct platform_device *pdev) return 0; probe_setup_failed: cqspi_controller_enable(cqspi, 0); + pm_runtime_disable(dev); probe_reset_failed: if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); @@ -1999,7 +2000,8 @@ static void cqspi_remove(struct platform_device *pdev) if (cqspi->rx_chan) dma_release_channel(cqspi->rx_chan); - clk_disable_unprepare(cqspi->clk); + if (pm_runtime_get_sync(&pdev->dev) >= 0) + clk_disable(cqspi->clk); if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); From 62207293479e6c03ef498a70f2914c51f4d31d2c Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Fri, 16 May 2025 15:16:55 +0300 Subject: [PATCH 192/297] drm/xe/display: Add check for alloc_ordered_workqueue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add check for the return value of alloc_ordered_workqueue() in xe_display_create() to catch potential exception. Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Cc: stable@vger.kernel.org Signed-off-by: Haoxiang Li Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/4ee1b0e5d1626ce1dde2e82af05c2edaed50c3aa.1747397638.git.jani.nikula@intel.com Signed-off-by: Jani Nikula (cherry picked from commit 5b62d63395d5b7d4094e7cd380bccae4b25415cb) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/display/xe_display.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index 68f064f33d4b..9f4ade25787a 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -104,6 +104,8 @@ int xe_display_create(struct xe_device *xe) spin_lock_init(&xe->display.fb_tracking.lock); xe->display.hotplug.dp_wq = alloc_ordered_workqueue("xe-dp", 0); + if (!xe->display.hotplug.dp_wq) + return -ENOMEM; return drmm_add_action_or_reset(&xe->drm, display_destroy, NULL); } From 9127a69c7193ad47047ff968a2de9161d5c93d37 Mon Sep 17 00:00:00 2001 From: Karthik Poosa Date: Tue, 17 Jun 2025 17:30:30 +0530 Subject: [PATCH 193/297] drm/xe/hwmon: Fix xe_hwmon_power_max_write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevent other bits of mailbox power limit from being overwritten with 0. This issue was due to a missing read and modify of current power limit, before setting a requested mailbox power limit, which is added in this patch. v2: - Improve commit message. (Anshuman) v3: - Rebase. - Rephrase commit message. (Riana) - Add read-modify-write variant of xe_hwmon_pcode_write_power_limit() i.e. xe_hwmon_pcode_rmw_power_limit(). (Badal) - Use xe_hwmon_pcode_rmw_power_limit() to set mailbox power limits. - Remove xe_hwmon_pcode_write_power_limit() as all mailbox power limits writes use xe_hwmon_pcode_rmw_power_limit() only. v4: - Use PWR_LIM in place of (PWR_LIM_EN | PWR_LIM_VAL) wherever applicable. (Riana) Fixes: 25a2aa779fc3 ("drm/xe/hwmon: Add support to manage power limits though mailbox") Reviewed-by: Riana Tauro Signed-off-by: Karthik Poosa Reviewed-by: Badal Nilawar Link: https://lore.kernel.org/r/20250617120030.612819-1-karthik.poosa@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 8aa7306631f088881759398972d503757cf0c901) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/regs/xe_mchbar_regs.h | 1 + drivers/gpu/drm/xe/xe_hwmon.c | 34 +++++++++++------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h index 5394a1373a6b..ef2bf984723f 100644 --- a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h @@ -40,6 +40,7 @@ #define PCU_CR_PACKAGE_RAPL_LIMIT XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x59a0) #define PWR_LIM_VAL REG_GENMASK(14, 0) #define PWR_LIM_EN REG_BIT(15) +#define PWR_LIM REG_GENMASK(15, 0) #define PWR_LIM_TIME REG_GENMASK(23, 17) #define PWR_LIM_TIME_X REG_GENMASK(23, 22) #define PWR_LIM_TIME_Y REG_GENMASK(21, 17) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index 74f31639b37f..f008e8049700 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -159,8 +159,8 @@ static int xe_hwmon_pcode_read_power_limit(const struct xe_hwmon *hwmon, u32 att return ret; } -static int xe_hwmon_pcode_write_power_limit(const struct xe_hwmon *hwmon, u32 attr, u8 channel, - u32 uval) +static int xe_hwmon_pcode_rmw_power_limit(const struct xe_hwmon *hwmon, u32 attr, u8 channel, + u32 clr, u32 set) { struct xe_tile *root_tile = xe_device_get_root_tile(hwmon->xe); u32 val0, val1; @@ -179,7 +179,7 @@ static int xe_hwmon_pcode_write_power_limit(const struct xe_hwmon *hwmon, u32 at channel, val0, val1, ret); if (attr == PL1_HWMON_ATTR) - val0 = uval; + val0 = (val0 & ~clr) | set; else return -EIO; @@ -339,7 +339,7 @@ static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, u32 attr, int channe if (hwmon->xe->info.has_mbx_power_limits) { drm_dbg(&hwmon->xe->drm, "disabling %s on channel %d\n", PWR_ATTR_TO_STR(attr), channel); - xe_hwmon_pcode_write_power_limit(hwmon, attr, channel, 0); + xe_hwmon_pcode_rmw_power_limit(hwmon, attr, channel, PWR_LIM_EN, 0); xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, ®_val); } else { reg_val = xe_mmio_rmw32(mmio, rapl_limit, PWR_LIM_EN, 0); @@ -370,10 +370,9 @@ static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, u32 attr, int channe } if (hwmon->xe->info.has_mbx_power_limits) - ret = xe_hwmon_pcode_write_power_limit(hwmon, attr, channel, reg_val); + ret = xe_hwmon_pcode_rmw_power_limit(hwmon, attr, channel, PWR_LIM, reg_val); else - reg_val = xe_mmio_rmw32(mmio, rapl_limit, PWR_LIM_EN | PWR_LIM_VAL, - reg_val); + reg_val = xe_mmio_rmw32(mmio, rapl_limit, PWR_LIM, reg_val); unlock: mutex_unlock(&hwmon->hwmon_lock); return ret; @@ -563,14 +562,11 @@ xe_hwmon_power_max_interval_store(struct device *dev, struct device_attribute *a mutex_lock(&hwmon->hwmon_lock); - if (hwmon->xe->info.has_mbx_power_limits) { - ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, (u32 *)&r); - r = (r & ~PWR_LIM_TIME) | rxy; - xe_hwmon_pcode_write_power_limit(hwmon, power_attr, channel, r); - } else { + if (hwmon->xe->info.has_mbx_power_limits) + xe_hwmon_pcode_rmw_power_limit(hwmon, power_attr, channel, PWR_LIM_TIME, rxy); + else r = xe_mmio_rmw32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel), PWR_LIM_TIME, rxy); - } mutex_unlock(&hwmon->hwmon_lock); @@ -1138,12 +1134,12 @@ xe_hwmon_get_preregistration_info(struct xe_hwmon *hwmon) } else { drm_info(&hwmon->xe->drm, "Using mailbox commands for power limits\n"); /* Write default limits to read from pcode from now on. */ - xe_hwmon_pcode_write_power_limit(hwmon, PL1_HWMON_ATTR, - CHANNEL_CARD, - hwmon->pl1_on_boot[CHANNEL_CARD]); - xe_hwmon_pcode_write_power_limit(hwmon, PL1_HWMON_ATTR, - CHANNEL_PKG, - hwmon->pl1_on_boot[CHANNEL_PKG]); + xe_hwmon_pcode_rmw_power_limit(hwmon, PL1_HWMON_ATTR, + CHANNEL_CARD, PWR_LIM | PWR_LIM_TIME, + hwmon->pl1_on_boot[CHANNEL_CARD]); + xe_hwmon_pcode_rmw_power_limit(hwmon, PL1_HWMON_ATTR, + CHANNEL_PKG, PWR_LIM | PWR_LIM_TIME, + hwmon->pl1_on_boot[CHANNEL_PKG]); hwmon->scl_shift_power = PWR_UNIT; hwmon->scl_shift_energy = ENERGY_UNIT; hwmon->scl_shift_time = TIME_UNIT; From 5c4acbc8ce9025fe6e318966af8d3c42ffc6e9ca Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Wed, 25 Jun 2025 03:10:27 +0800 Subject: [PATCH 194/297] bcachefs: Don't unlock the trans if ret doesn't match BCH_ERR_operation_blocked Reported-by: syzbot+d540192e763531d307ff@syzkaller.appspotmail.com Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7bf1bd6a6e92..553059b33bfd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1287,10 +1287,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, do { ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); - + if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); - } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); + } while (1); } if (ret) { From 865ad1dbf13244b349e3da2731c1188476ad17b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jun 2025 18:42:42 -0400 Subject: [PATCH 195/297] bcachefs: Check for bad write buffer key when moving from journal Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 5 ++--- fs/bcachefs/btree_write_buffer.c | 5 ++--- fs/bcachefs/btree_write_buffer.h | 6 ++++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9feef1dc4de5..0b98ab959719 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -170,8 +170,7 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); -int bch2_btree_write_buffer_insert_err(struct btree_trans *, - enum btree_id, struct bkey_i *); +int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, @@ -182,7 +181,7 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); + int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k); dump_stack(); return ret; } diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 21b5c03d1822..4b095235a0d2 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -267,10 +267,9 @@ static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } -int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, +int bch2_btree_write_buffer_insert_err(struct bch_fs *c, enum btree_id btree, struct bkey_i *k) { - struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; prt_printf(&buf, "attempting to do write buffer update on non wb btree="); @@ -332,7 +331,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; if (unlikely(!btree_type_uses_write_buffer(k->btree))) { - ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); + ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k); goto err; } diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 05f56fd1eed0..c351d21aca0b 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -89,6 +89,12 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) { + if (unlikely(!btree_type_uses_write_buffer(btree))) { + int ret = bch2_btree_write_buffer_insert_err(c, btree, k); + dump_stack(); + return ret; + } + EBUG_ON(!dst->seq); return k->k.type == KEY_TYPE_accounting From 5f465c148c61e876b6d6eacd8e8e365f2d47758f Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Fri, 20 Jun 2025 16:15:03 -0700 Subject: [PATCH 196/297] x86/traps: Initialize DR6 by writing its architectural reset value Initialize DR6 by writing its architectural reset value to avoid incorrectly zeroing DR6 to clear DR6.BLD at boot time, which leads to a false bus lock detected warning. The Intel SDM says: 1) Certain debug exceptions may clear bits 0-3 of DR6. 2) BLD induced #DB clears DR6.BLD and any other debug exception doesn't modify DR6.BLD. 3) RTM induced #DB clears DR6.RTM and any other debug exception sets DR6.RTM. To avoid confusion in identifying debug exceptions, debug handlers should set DR6.BLD and DR6.RTM, and clear other DR6 bits before returning. The DR6 architectural reset value 0xFFFF0FF0, already defined as macro DR6_RESERVED, satisfies these requirements, so just use it to reinitialize DR6 whenever needed. Since clear_all_debug_regs() no longer zeros all debug registers, rename it to initialize_debug_regs() to better reflect its current behavior. Since debug_read_clear_dr6() no longer clears DR6, rename it to debug_read_reset_dr6() to better reflect its current behavior. Fixes: ebb1064e7c2e9 ("x86/traps: Handle #DB for bus lock") Reported-by: Sohil Mehta Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Reviewed-by: H. Peter Anvin (Intel) Reviewed-by: Sohil Mehta Acked-by: Peter Zijlstra (Intel) Tested-by: Sohil Mehta Link: https://lore.kernel.org/lkml/06e68373-a92b-472e-8fd9-ba548119770c@intel.com/ Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250620231504.2676902-2-xin%40zytor.com --- arch/x86/include/uapi/asm/debugreg.h | 21 ++++++++++++++++- arch/x86/kernel/cpu/common.c | 24 ++++++++------------ arch/x86/kernel/traps.c | 34 +++++++++++++++++----------- 3 files changed, 51 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index 0007ba077c0c..41da492dfb01 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -15,7 +15,26 @@ which debugging register was responsible for the trap. The other bits are either reserved or not of interest to us. */ -/* Define reserved bits in DR6 which are always set to 1 */ +/* + * Define bits in DR6 which are set to 1 by default. + * + * This is also the DR6 architectural value following Power-up, Reset or INIT. + * + * Note, with the introduction of Bus Lock Detection (BLD) and Restricted + * Transactional Memory (RTM), the DR6 register has been modified: + * + * 1) BLD flag (bit 11) is no longer reserved to 1 if the CPU supports + * Bus Lock Detection. The assertion of a bus lock could clear it. + * + * 2) RTM flag (bit 16) is no longer reserved to 1 if the CPU supports + * restricted transactional memory. #DB occurred inside an RTM region + * could clear it. + * + * Apparently, DR6.BLD and DR6.RTM are active low bits. + * + * As a result, DR6_RESERVED is an incorrect name now, but it is kept for + * compatibility. + */ #define DR6_RESERVED (0xFFFF0FF0) #define DR_TRAP0 (0x1) /* db0 */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8feb8fd2957a..0f6c280a94f0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2243,20 +2243,16 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif -/* - * Clear all 6 debug registers: - */ -static void clear_all_debug_regs(void) +static void initialize_debug_regs(void) { - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - - set_debugreg(0, i); - } + /* Control register first -- to make sure everything is disabled. */ + set_debugreg(0, 7); + set_debugreg(DR6_RESERVED, 6); + /* dr5 and dr4 don't exist */ + set_debugreg(0, 3); + set_debugreg(0, 2); + set_debugreg(0, 1); + set_debugreg(0, 0); } #ifdef CONFIG_KGDB @@ -2417,7 +2413,7 @@ void cpu_init(void) load_mm_ldt(&init_mm); - clear_all_debug_regs(); + initialize_debug_regs(); dbg_restore_debug_regs(); doublefault_init_cpu_tss(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c5c897a86418..36354b470590 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1022,24 +1022,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline unsigned long debug_read_clear_dr6(void) +static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; + get_debugreg(dr6, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + /* * The Intel SDM says: * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. + * Certain debug exceptions may clear bits 0-3 of DR6. * - * Keep it simple: clear DR6 immediately. + * BLD induced #DB clears DR6.BLD and any other debug + * exception doesn't modify DR6.BLD. + * + * RTM induced #DB clears DR6.RTM and any other debug + * exception sets DR6.RTM. + * + * To avoid confusion in identifying debug exceptions, + * debug handlers should set DR6.BLD and DR6.RTM, and + * clear other DR6 bits before returning. + * + * Keep it simple: write DR6 with its architectural reset + * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ - get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); - dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } @@ -1239,13 +1247,13 @@ static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - exc_debug_kernel(regs, debug_read_clear_dr6()); + exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - exc_debug_user(regs, debug_read_clear_dr6()); + exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED @@ -1264,7 +1272,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which - * debug_read_clear_dr6() returns for the IDT entry points. + * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); @@ -1279,7 +1287,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6 = debug_read_clear_dr6(); + unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); From fa7d0f83c5c4223a01598876352473cb3d3bd4d7 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Fri, 20 Jun 2025 16:15:04 -0700 Subject: [PATCH 197/297] x86/traps: Initialize DR7 by writing its architectural reset value Initialize DR7 by writing its architectural reset value to always set bit 10, which is reserved to '1', when "clearing" DR7 so as not to trigger unanticipated behavior if said bit is ever unreserved, e.g. as a feature enabling flag with inverted polarity. Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Reviewed-by: H. Peter Anvin (Intel) Reviewed-by: Sohil Mehta Acked-by: Peter Zijlstra (Intel) Acked-by: Sean Christopherson Tested-by: Sohil Mehta Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250620231504.2676902-3-xin%40zytor.com --- arch/x86/include/asm/debugreg.h | 19 +++++++++++++++---- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 7 files changed, 22 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 363110e6b2e3..a2c1f2d24b64 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -9,6 +9,14 @@ #include #include +/* + * Define bits that are always set to 1 in DR7, only bit 10 is + * architecturally reserved to '1'. + * + * This is also the init/reset value for DR7. + */ +#define DR7_FIXED_1 0x00000400 + DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL @@ -100,8 +108,8 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) static inline void hw_breakpoint_disable(void) { - /* Zero the control register for HW Breakpoint */ - set_debugreg(0UL, 7); + /* Reset the control register for HW Breakpoint */ + set_debugreg(DR7_FIXED_1, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); @@ -125,9 +133,12 @@ static __always_inline unsigned long local_db_save(void) return 0; get_debugreg(dr7, 7); - dr7 &= ~0x400; /* architecturally set bit */ + + /* Architecturally set bit */ + dr7 &= ~DR7_FIXED_1; if (dr7) - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); + /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b4a391929cdb..639d9bcee842 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -249,7 +250,6 @@ enum x86_intercept_stage; #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) -#define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff #define KVM_GUESTDBG_VALID_MASK \ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f6c280a94f0..27125e009847 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2246,7 +2246,7 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); static void initialize_debug_regs(void) { /* Control register first -- to make sure everything is disabled. */ - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); set_debugreg(DR6_RESERVED, 6); /* dr5 and dr4 don't exist */ set_debugreg(0, 3); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 102641fd2172..8b1a9733d13e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -385,7 +385,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) struct perf_event *bp; /* Disable hardware debugging while we are in kgdb: */ - set_debugreg(0UL, 7); + set_debugreg(DR7_FIXED_1, 7); for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a10e180cbf23..3ef15c2f152f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. */ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400)) + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1)) return; printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8d6cf25127aa..b972bf72fb8b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -133,7 +133,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400))) { + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) { printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", log_lvl, d0, d1, d2); printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b58a74c1722d..a9d992d5652f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11035,7 +11035,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.switch_db_regs && !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); @@ -11044,7 +11044,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); } vcpu->arch.host_debugctl = get_debugctlmsr(); From f5109c201cf2bc304a05ecc40c2aabb119b27833 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jun 2025 17:53:00 -0400 Subject: [PATCH 198/297] bcachefs: Use wait_on_allocator() when allocating journal wait_on_allocator() emits debug info when we hang trying to allocate. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index df71af0013ba..f22b05e02c1e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1283,7 +1283,7 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } return ret; From 1f8aede70d491a1d5867f575ca44c86fe2e335ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Jun 2025 16:04:32 -0400 Subject: [PATCH 199/297] bcachefs: fix bch2_journal_keys_peek_prev_min() underflow Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index a41fabd06332..ea839560a136 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -137,12 +137,15 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b struct journal_key *k; BUG_ON(*idx > keys->nr); + + if (!keys->nr) + return NULL; search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); while (*idx < keys->nr && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) >= 0) { + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { (*idx)++; iters++; if (iters == 10) { @@ -151,18 +154,23 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b } } + if (*idx == keys->nr) + --(*idx); + struct bkey_i *ret = NULL; rcu_read_lock(); /* for overwritten_ranges */ - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + while (true) { + k = idx_to_key(keys, *idx); if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) break; if (k->overwritten) { if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start - 1; - else - *idx -= 1; + *idx = rcu_dereference(k->overwritten_range)->start; + if (!*idx) + break; + --(*idx); continue; } @@ -171,6 +179,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b break; } + if (!*idx) + break; --(*idx); iters++; if (iters == 10) { From 22bbc1dcd0d6785fb390c41f0dd5b5e218d23bdd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 23 Jun 2025 12:00:53 +0200 Subject: [PATCH 200/297] vsock/uapi: fix linux/vm_sockets.h userspace compilation errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a userspace application just include will fail to build with the following errors: /usr/include/linux/vm_sockets.h:182:39: error: invalid application of ‘sizeof’ to incomplete type ‘struct sockaddr’ 182 | unsigned char svm_zero[sizeof(struct sockaddr) - | ^~~~~~ /usr/include/linux/vm_sockets.h:183:39: error: ‘sa_family_t’ undeclared here (not in a function) 183 | sizeof(sa_family_t) - | Include for userspace (guarded by ifndef __KERNEL__) where `struct sockaddr` and `sa_family_t` are defined. We already do something similar in and . Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reported-by: Daan De Meyer Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250623100053.40979-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/vm_sockets.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index ed07181d4eff..e05280e41522 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -17,6 +17,10 @@ #ifndef _UAPI_VM_SOCKETS_H #define _UAPI_VM_SOCKETS_H +#ifndef __KERNEL__ +#include /* for struct sockaddr and sa_family_t */ +#endif + #include #include From 9caca6ac0e26cd20efd490d8b3b2ffb1c7c00f6f Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Mon, 23 Jun 2025 09:06:38 -0700 Subject: [PATCH 201/297] bnxt: properly flush XDP redirect lists We encountered following crash when testing a XDP_REDIRECT feature in production: [56251.579676] list_add corruption. next->prev should be prev (ffff93120dd40f30), but was ffffb301ef3a6740. (next=ffff93120dd 40f30). [56251.601413] ------------[ cut here ]------------ [56251.611357] kernel BUG at lib/list_debug.c:29! [56251.621082] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [56251.632073] CPU: 111 UID: 0 PID: 0 Comm: swapper/111 Kdump: loaded Tainted: P O 6.12.33-cloudflare-2025.6. 3 #1 [56251.653155] Tainted: [P]=PROPRIETARY_MODULE, [O]=OOT_MODULE [56251.663877] Hardware name: MiTAC GC68B-B8032-G11P6-GPU/S8032GM-HE-CFR, BIOS V7.020.B10-sig 01/22/2025 [56251.682626] RIP: 0010:__list_add_valid_or_report+0x4b/0xa0 [56251.693203] Code: 0e 48 c7 c7 68 e7 d9 97 e8 42 16 fe ff 0f 0b 48 8b 52 08 48 39 c2 74 14 48 89 f1 48 c7 c7 90 e7 d9 97 48 89 c6 e8 25 16 fe ff <0f> 0b 4c 8b 02 49 39 f0 74 14 48 89 d1 48 c7 c7 e8 e7 d9 97 4c 89 [56251.725811] RSP: 0018:ffff93120dd40b80 EFLAGS: 00010246 [56251.736094] RAX: 0000000000000075 RBX: ffffb301e6bba9d8 RCX: 0000000000000000 [56251.748260] RDX: 0000000000000000 RSI: ffff9149afda0b80 RDI: ffff9149afda0b80 [56251.760349] RBP: ffff9131e49c8000 R08: 0000000000000000 R09: ffff93120dd40a18 [56251.772382] R10: ffff9159cf2ce1a8 R11: 0000000000000003 R12: ffff911a80850000 [56251.784364] R13: ffff93120fbc7000 R14: 0000000000000010 R15: ffff9139e7510e40 [56251.796278] FS: 0000000000000000(0000) GS:ffff9149afd80000(0000) knlGS:0000000000000000 [56251.809133] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [56251.819561] CR2: 00007f5e85e6f300 CR3: 00000038b85e2006 CR4: 0000000000770ef0 [56251.831365] PKRU: 55555554 [56251.838653] Call Trace: [56251.845560] [56251.851943] cpu_map_enqueue.cold+0x5/0xa [56251.860243] xdp_do_redirect+0x2d9/0x480 [56251.868388] bnxt_rx_xdp+0x1d8/0x4c0 [bnxt_en] [56251.877028] bnxt_rx_pkt+0x5f7/0x19b0 [bnxt_en] [56251.885665] ? cpu_max_write+0x1e/0x100 [56251.893510] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.902276] __bnxt_poll_work+0x190/0x340 [bnxt_en] [56251.911058] bnxt_poll+0xab/0x1b0 [bnxt_en] [56251.919041] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.927568] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.935958] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.944250] __napi_poll+0x2b/0x160 [56251.951155] bpf_trampoline_6442548651+0x79/0x123 [56251.959262] __napi_poll+0x5/0x160 [56251.966037] net_rx_action+0x3d2/0x880 [56251.973133] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.981265] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.989262] ? __hrtimer_run_queues+0x162/0x2a0 [56251.996967] ? srso_alias_return_thunk+0x5/0xfbef5 [56252.004875] ? srso_alias_return_thunk+0x5/0xfbef5 [56252.012673] ? bnxt_msix+0x62/0x70 [bnxt_en] [56252.019903] handle_softirqs+0xcf/0x270 [56252.026650] irq_exit_rcu+0x67/0x90 [56252.032933] common_interrupt+0x85/0xa0 [56252.039498] [56252.044246] [56252.048935] asm_common_interrupt+0x26/0x40 [56252.055727] RIP: 0010:cpuidle_enter_state+0xb8/0x420 [56252.063305] Code: dc 01 00 00 e8 f9 79 3b ff e8 64 f7 ff ff 49 89 c5 0f 1f 44 00 00 31 ff e8 a5 32 3a ff 45 84 ff 0f 85 ae 01 00 00 fb 45 85 f6 <0f> 88 88 01 00 00 48 8b 04 24 49 63 ce 4c 89 ea 48 6b f1 68 48 29 [56252.088911] RSP: 0018:ffff93120c97fe98 EFLAGS: 00000202 [56252.096912] RAX: ffff9149afd80000 RBX: ffff9141d3a72800 RCX: 0000000000000000 [56252.106844] RDX: 00003329176c6b98 RSI: ffffffe36db3fdc7 RDI: 0000000000000000 [56252.116733] RBP: 0000000000000002 R08: 0000000000000002 R09: 000000000000004e [56252.126652] R10: ffff9149afdb30c4 R11: 071c71c71c71c71c R12: ffffffff985ff860 [56252.136637] R13: 00003329176c6b98 R14: 0000000000000002 R15: 0000000000000000 [56252.146667] ? cpuidle_enter_state+0xab/0x420 [56252.153909] cpuidle_enter+0x2d/0x40 [56252.160360] do_idle+0x176/0x1c0 [56252.166456] cpu_startup_entry+0x29/0x30 [56252.173248] start_secondary+0xf7/0x100 [56252.179941] common_startup_64+0x13e/0x141 [56252.186886] From the crash dump, we found that the cpu_map_flush_list inside redirect info is partially corrupted: its list_head->next points to itself, but list_head->prev points to a valid list of unflushed bq entries. This turned out to be a result of missed XDP flush on redirect lists. By digging in the actual source code, we found that commit 7f0a168b0441 ("bnxt_en: Add completion ring pointer in TX and RX ring structures") incorrectly overwrites the event mask for XDP_REDIRECT in bnxt_rx_xdp. We can stably reproduce this crash by returning XDP_TX and XDP_REDIRECT randomly for incoming packets in a naive XDP program. Properly propagate the XDP_REDIRECT events back fixes the crash. Fixes: a7559bc8c17c ("bnxt: support transmit and free of aggregation buffers") Tested-by: Andrew Rzeznik Signed-off-by: Yan Zhai Acked-by: Jesper Dangaard Brouer Reviewed-by: Michael Chan Reviewed-by: Andy Gospodarek Link: https://patch.msgid.link/aFl7jpCNzscumuN2@debian.debian Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2cb3185c442c..ae89a981e052 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2989,6 +2989,7 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, { struct bnxt_napi *bnapi = cpr->bnapi; u32 raw_cons = cpr->cp_raw_cons; + bool flush_xdp = false; u32 cons; int rx_pkts = 0; u8 event = 0; @@ -3042,6 +3043,8 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, else rc = bnxt_force_rx_discard(bp, cpr, &raw_cons, &event); + if (event & BNXT_REDIRECT_EVENT) + flush_xdp = true; if (likely(rc >= 0)) rx_pkts += rc; /* Increment rx_pkts when rc is -ENOMEM to count towards @@ -3066,7 +3069,7 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, } } - if (event & BNXT_REDIRECT_EVENT) { + if (flush_xdp) { xdp_do_flush(); event &= ~BNXT_REDIRECT_EVENT; } From 524346e9d79f63a6e5aaa645140da3d1ec7a8a0f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 25 Jun 2025 10:25:54 +0800 Subject: [PATCH 202/297] ublk: build batch from IOs in same io_ring_ctx and io task ublk_queue_cmd_list() dispatches the whole batch list by scheduling task work via the tail request's io_uring_cmd, this way is fine even though more than one io_ring_ctx are involved for this batch since it is just one running context. However, the task work handler ublk_cmd_list_tw_cb() takes `issue_flags` of tail uring_cmd's io_ring_ctx for completing all commands. This way is wrong if any uring_cmd is issued from different io_ring_ctx. Fixes it by always building batch IOs from same io_ring_ctx and io task because ublk_dispatch_req() does validate task context, and IO needs to be aborted in case of running from fallback task work context. For typical per-queue or per-io daemon implementation, this way shouldn't make difference from performance viewpoint, because single io_ring_ctx is taken in each daemon for normal use case. Fixes: d796cea7b9f3 ("ublk: implement ->queue_rqs()") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250625022554.883571-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d36f44f5ee80..90c02ced392a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1416,6 +1416,14 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, + const struct ublk_io *io2) +{ + return (io_uring_cmd_ctx_handle(io->cmd) == + io_uring_cmd_ctx_handle(io2->cmd)) && + (io->task == io2->task); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; @@ -1427,7 +1435,8 @@ static void ublk_queue_rqs(struct rq_list *rqlist) struct ublk_queue *this_q = req->mq_hctx->driver_data; struct ublk_io *this_io = &this_q->ios[req->tag]; - if (io && io->task != this_io->task && !rq_list_empty(&submit_list)) + if (io && !ublk_belong_to_same_batch(io, this_io) && + !rq_list_empty(&submit_list)) ublk_queue_cmd_list(io, &submit_list); io = this_io; From 5223372e672eaa576c892984b438875426d8ff2b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 23 Jun 2025 09:19:27 +0800 Subject: [PATCH 203/297] selftests: ublk: don't take same backing file for more than one ublk devices Don't use same backing file for more than one ublk devices, and avoid concurrent write on same file from more ublk disks. Fixes: 8ccebc19ee3d ("selftests: ublk: support UBLK_F_AUTO_BUF_REG") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250623011934.741788-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_stress_03.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh index 6eef282d569f..3ed4c9b2d8c0 100755 --- a/tools/testing/selftests/ublk/test_stress_03.sh +++ b/tools/testing/selftests/ublk/test_stress_03.sh @@ -32,22 +32,23 @@ _create_backfile 2 128M ublk_io_and_remove 8G -t null -q 4 -z & ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" & ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait if _have_feature "AUTO_BUF_REG"; then ublk_io_and_remove 8G -t null -q 4 --auto_zc & ublk_io_and_remove 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" & ublk_io_and_remove 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback & + wait fi -wait if _have_feature "PER_IO_DAEMON"; then ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks & ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" & ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks & + wait fi -wait _cleanup_test "stress" _show_result $TID $ERR_CODE From 67caa528ae08cd05e485c0ea6aea0baaf6579b06 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 21 Jun 2025 10:28:41 -0600 Subject: [PATCH 204/297] ublk: fix narrowing warnings in UAPI header When a C++ file compiled with -Wc++11-narrowing includes the UAPI header linux/ublk_cmd.h, ublk_sqe_addr_to_auto_buf_reg()'s assignments of u64 values to u8, u16, and u32 fields result in compiler warnings. Add explicit casts to the intended types to avoid these warnings. Drop the unnecessary bitmasks. Reported-by: Uday Shankar Signed-off-by: Caleb Sander Mateos Fixes: 99c1e4eb6a3f ("ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG") Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250621162842.337452-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 77d9d6af46da..c062109cb686 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -450,10 +450,10 @@ static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg( __u64 sqe_addr) { struct ublk_auto_buf_reg reg = { - .index = sqe_addr & 0xffff, - .flags = (sqe_addr >> 16) & 0xff, - .reserved0 = (sqe_addr >> 24) & 0xff, - .reserved1 = sqe_addr >> 32, + .index = (__u16)sqe_addr, + .flags = (__u8)(sqe_addr >> 16), + .reserved0 = (__u8)(sqe_addr >> 24), + .reserved1 = (__u32)(sqe_addr >> 32), }; return reg; From 81b4d1a1d03301dcca8af5c58eded9e535f1f6ed Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 21 Jun 2025 11:10:14 -0600 Subject: [PATCH 205/297] ublk: update UBLK_F_SUPPORT_ZERO_COPY comment in UAPI header UBLK_F_SUPPORT_ZERO_COPY has a very old comment describing the initial idea for how zero-copy would be implemented. The actual implementation added in commit 1f6540e2aabb ("ublk: zc register/unregister bvec") uses io_uring registered buffers rather than shared memory mapping. Remove the inaccurate remarks about mapping ublk request memory into the ublk server's address space and requiring 4K block size. Replace them with a description of the current zero-copy mechanism. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250621171015.354932-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c062109cb686..c9751bdfd937 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -135,8 +135,28 @@ #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) /* - * zero copy requires 4k block size, and can remap ublk driver's io - * request into ublksrv's vm space + * ublk server can register data buffers for incoming I/O requests with a sparse + * io_uring buffer table. The request buffer can then be used as the data buffer + * for io_uring operations via the fixed buffer index. + * Note that the ublk server can never directly access the request data memory. + * + * To use this feature, the ublk server must first register a sparse buffer + * table on an io_uring instance. + * When an incoming ublk request is received, the ublk server submits a + * UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The + * ublksrv_io_cmd's q_id and tag specify the request whose buffer to register + * and addr is the index in the io_uring's buffer table to install the buffer. + * SQEs can now be submitted to the io_uring to read/write the request's buffer + * by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or + * IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index. + * Once the last io_uring operation using the request's buffer has completed, + * the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag, + * and addr again specifying the request buffer to unregister. + * The ublk request is completed when its buffer is unregistered from all + * io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ. + * + * Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak + * uninitialized kernel memory by not reading into the full request buffer. */ #define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0) From 4c8a951787ffc4b61a547db9866196104971b5fd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 24 Jun 2025 18:41:21 +0800 Subject: [PATCH 206/297] ublk: setup ublk_io correctly in case of ublk_get_data() failure If ublk_get_data() fails, -EIOCBQUEUED is returned and the current command becomes ASYNC. And the only reason is that mapping data can't move on, because of no enough pages or pending signal, then the current ublk request has to be requeued. Once the request need to be requeued, we have to setup `ublk_io` correctly, including io->cmd and flags, otherwise the request may not be forwarded to ublk server successfully. Fixes: 9810362a57cb ("ublk: don't call ublk_dispatch_req() for NEED_GET_DATA") Reported-by: Changhui Zhong Closes: https://lore.kernel.org/linux-block/CAGVVp+VN9QcpHUz_0nasFf5q9i1gi8H8j-G-6mkBoqa3TyjRHA@mail.gmail.com/ Signed-off-by: Ming Lei Tested-by: Changhui Zhong Link: https://lore.kernel.org/r/20250624104121.859519-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 90c02ced392a..d441d3259edb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1148,8 +1148,8 @@ static inline void __ublk_complete_rq(struct request *req) blk_mq_end_request(req, res); } -static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, - int res, unsigned issue_flags) +static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, + struct request *req) { /* read cmd first because req will overwrite it */ struct io_uring_cmd *cmd = io->cmd; @@ -1164,6 +1164,13 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, io->flags &= ~UBLK_IO_FLAG_ACTIVE; io->req = req; + return cmd; +} + +static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, + int res, unsigned issue_flags) +{ + struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); /* tell ublksrv one io request is coming */ io_uring_cmd_done(cmd, res, 0, issue_flags); @@ -2157,10 +2164,9 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq, return 0; } -static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io) +static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) { - struct request *req = io->req; - /* * We have handled UBLK_IO_NEED_GET_DATA command, * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just @@ -2187,6 +2193,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, u32 cmd_op = cmd->cmd_op; unsigned tag = ub_cmd->tag; int ret = -EINVAL; + struct request *req; pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", __func__, cmd->cmd_op, ub_cmd->q_id, tag, @@ -2245,11 +2252,19 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, goto out; break; case UBLK_IO_NEED_GET_DATA: - io->addr = ub_cmd->addr; - if (!ublk_get_data(ubq, io)) - return -EIOCBQUEUED; - - return UBLK_IO_RES_OK; + /* + * ublk_get_data() may fail and fallback to requeue, so keep + * uring_cmd active first and prepare for handling new requeued + * request + */ + req = io->req; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + if (likely(ublk_get_data(ubq, io, req))) { + __ublk_prep_compl_io_cmd(io, req); + return UBLK_IO_RES_OK; + } + break; default: goto out; } From 5afb4bf9fc62d828647647ec31745083637132e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Jun 2025 14:40:33 +0100 Subject: [PATCH 207/297] io_uring/rsrc: fix folio unpinning syzbot complains about an unmapping failure: [ 108.070381][ T14] kernel BUG at mm/gup.c:71! [ 108.070502][ T14] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP [ 108.123672][ T14] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20250221-8.fc42 02/21/2025 [ 108.127458][ T14] Workqueue: iou_exit io_ring_exit_work [ 108.174205][ T14] Call trace: [ 108.175649][ T14] sanity_check_pinned_pages+0x7cc/0x7d0 (P) [ 108.178138][ T14] unpin_user_page+0x80/0x10c [ 108.180189][ T14] io_release_ubuf+0x84/0xf8 [ 108.182196][ T14] io_free_rsrc_node+0x250/0x57c [ 108.184345][ T14] io_rsrc_data_free+0x148/0x298 [ 108.186493][ T14] io_sqe_buffers_unregister+0x84/0xa0 [ 108.188991][ T14] io_ring_ctx_free+0x48/0x480 [ 108.191057][ T14] io_ring_exit_work+0x764/0x7d8 [ 108.193207][ T14] process_one_work+0x7e8/0x155c [ 108.195431][ T14] worker_thread+0x958/0xed8 [ 108.197561][ T14] kthread+0x5fc/0x75c [ 108.199362][ T14] ret_from_fork+0x10/0x20 We can pin a tail page of a folio, but then io_uring will try to unpin the head page of the folio. While it should be fine in terms of keeping the page actually alive, mm folks say it's wrong and triggers a debug warning. Use unpin_user_folio() instead of unpin_user_page*. Cc: stable@vger.kernel.org Debugged-by: David Hildenbrand Reported-by: syzbot+1d335893772467199ab6@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/683f1551.050a0220.55ceb.0017.GAE@google.com Fixes: a8edbb424b139 ("io_uring/rsrc: enable multi-hugepage buffer coalescing") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/io-uring/a28b0f87339ac2acf14a645dad1e95bbcbf18acd.1750771718.git.asml.silence@gmail.com/ [axboe: adapt to current tree, massage commit message] Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d724602697e7..0c09e38784c9 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -112,8 +112,11 @@ static void io_release_ubuf(void *priv) struct io_mapped_ubuf *imu = priv; unsigned int i; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); + for (i = 0; i < imu->nr_bvecs; i++) { + struct folio *folio = page_folio(imu->bvec[i].bv_page); + + unpin_user_folio(folio, 1); + } } static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, @@ -840,8 +843,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (ret) { if (imu) io_free_imu(ctx, imu); - if (pages) - unpin_user_pages(pages, nr_pages); + if (pages) { + for (i = 0; i < nr_pages; i++) + unpin_user_folio(page_folio(pages[i]), 1); + } io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } From 3a3c6d61577dbb23c09df3e21f6f9eda1ecd634b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Jun 2025 14:40:34 +0100 Subject: [PATCH 208/297] io_uring/rsrc: don't rely on user vaddr alignment There is no guaranteed alignment for user pointers, however the calculation of an offset of the first page into a folio after coalescing uses some weird bit mask logic, get rid of it. Cc: stable@vger.kernel.org Reported-by: David Hildenbrand Fixes: a8edbb424b139 ("io_uring/rsrc: enable multi-hugepage buffer coalescing") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/io-uring/e387b4c78b33f231105a601d84eefd8301f57954.1750771718.git.asml.silence@gmail.com/ Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 7 ++++++- io_uring/rsrc.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0c09e38784c9..afc67530f912 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -734,6 +734,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, data->nr_pages_mid = folio_nr_pages(folio); data->folio_shift = folio_shift(folio); + data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); /* * Check if pages are contiguous inside a folio, and all folios have @@ -827,7 +828,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); + + off = (unsigned long)iov->iov_base & ~PAGE_MASK; + if (coalesced) + off += data.first_folio_page_idx << PAGE_SHIFT; + node->buf = imu; ret = 0; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 0d2138f16322..25e7e998dcfd 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -49,6 +49,7 @@ struct io_imu_folio_data { unsigned int nr_pages_mid; unsigned int folio_shift; unsigned int nr_folios; + unsigned long first_folio_page_idx; }; bool io_rsrc_cache_init(struct io_ring_ctx *ctx); From e1d7727b73a1f78035316ac35ee184d477059f0b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Jun 2025 14:40:35 +0100 Subject: [PATCH 209/297] io_uring: don't assume uaddr alignment in io_vec_fill_bvec There is no guaranteed alignment for user pointers. Don't use mask trickery and adjust the offset by bv_offset. Cc: stable@vger.kernel.org Reported-by: David Hildenbrand Fixes: 9ef4cbbcb4ac3 ("io_uring: add infra for importing vectored reg buffers") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/io-uring/19530391f5c361a026ac9b401ff8e123bde55d98.1750771718.git.asml.silence@gmail.com/ Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index afc67530f912..f2b31fb68992 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1339,7 +1339,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, { unsigned long folio_size = 1 << imu->folio_shift; unsigned long folio_mask = folio_size - 1; - u64 folio_addr = imu->ubuf & ~folio_mask; struct bio_vec *res_bvec = vec->bvec; size_t total_len = 0; unsigned bvec_idx = 0; @@ -1361,8 +1360,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) return -EOVERFLOW; - /* by using folio address it also accounts for bvec offset */ - offset = buf_addr - folio_addr; + offset = buf_addr - imu->ubuf; + /* + * Only the first bvec can have non zero bv_offset, account it + * here and work with full folios below. + */ + offset += imu->bvec[0].bv_offset; + src_bvec = imu->bvec + (offset >> imu->folio_shift); offset &= folio_mask; From 5e9571750c4e53d16727a04159455c693d7b31cb Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Tue, 24 Jun 2025 17:00:47 +0800 Subject: [PATCH 210/297] ALSA: usb: qcom: fix NULL pointer dereference in qmi_stop_session The find_substream() call may return NULL, but the error path dereferenced 'subs' unconditionally via dev_err(&subs->dev->dev, ...), causing a NULL pointer dereference when subs is NULL. Fix by switching to &uadev[idx].udev->dev which is always valid in this context. Signed-off-by: Pei Xiao Link: https://patch.msgid.link/86ac2939273ac853535049e60391c09d7688714e.1750755508.git.xiaopei01@kylinos.cn Signed-off-by: Takashi Iwai --- sound/usb/qcom/qc_audio_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c index 797afd4561bd..3543b5a53592 100644 --- a/sound/usb/qcom/qc_audio_offload.c +++ b/sound/usb/qcom/qc_audio_offload.c @@ -759,7 +759,7 @@ static void qmi_stop_session(void) subs = find_substream(pcm_card_num, info->pcm_dev_num, info->direction); if (!subs || !chip || atomic_read(&chip->shutdown)) { - dev_err(&subs->dev->dev, + dev_err(&uadev[idx].udev->dev, "no sub for c#%u dev#%u dir%u\n", info->pcm_card_num, info->pcm_dev_num, From d02b2103a08b6d6908f1d3d8e8783d3f342555ac Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:18:18 +0200 Subject: [PATCH 211/297] drm/i915: fix build error some more An earlier patch fixed a build failure with clang, but I still see the same problem with some configurations using gcc: drivers/gpu/drm/i915/i915_pmu.c: In function 'config_mask': include/linux/compiler_types.h:568:38: error: call to '__compiletime_assert_462' declared with attribute error: BUILD_BUG_ON failed: bit > BITS_PER_TYPE(typeof_member(struct i915_pmu, enable)) - 1 drivers/gpu/drm/i915/i915_pmu.c:116:3: note: in expansion of macro 'BUILD_BUG_ON' 116 | BUILD_BUG_ON(bit > As I understand it, the problem is that the function is not always fully inlined, but the __builtin_constant_p() can still evaluate the argument as being constant. Marking it as __always_inline so far works for me in all configurations. Fixes: a7137b1825b5 ("drm/i915/pmu: Fix build error with GCOV and AutoFDO enabled") Fixes: a644fde77ff7 ("drm/i915/pmu: Change bitmask of enabled events to u32") Reviewed-by: Rodrigo Vivi Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250620111824.3395007-1-arnd@kernel.org Signed-off-by: Rodrigo Vivi (cherry picked from commit ef69f9dd1cd7301cdf04ba326ed28152a3affcf6) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 990bfaba3ce4..5bc696bfbb0f 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -108,7 +108,7 @@ static unsigned int config_bit(const u64 config) return other_bit(config); } -static u32 config_mask(const u64 config) +static __always_inline u32 config_mask(const u64 config) { unsigned int bit = config_bit(config); From c55c7a85e02a7bfee20a3ffebdff7cbeb41613ef Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:25 +0800 Subject: [PATCH 212/297] um: ubd: Add missing error check in start_io_thread() The subsequent call to os_set_fd_block() overwrites the previous return value. OR the two return values together to fix it. Fixes: f88f0bdfc32f ("um: UBD Improvements") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c index c5e6545f6fcf..8e8a8bf518b6 100644 --- a/arch/um/drivers/ubd_user.c +++ b/arch/um/drivers/ubd_user.c @@ -41,7 +41,7 @@ int start_io_thread(struct os_helper_thread **td_out, int *fd_out) *fd_out = fds[1]; err = os_set_fd_block(*fd_out, 0); - err = os_set_fd_block(kernel_fd, 0); + err |= os_set_fd_block(kernel_fd, 0); if (err) { printk("start_io_thread - failed to set nonblocking I/O.\n"); goto out_close; From bc4e2ae08183d9017f43bf88aa7cfdf84e76f573 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:27 +0800 Subject: [PATCH 213/297] um: vfio: Prevent duplicate device assignments Ensure devices are assigned only once. Reject subsequent requests for duplicate assignments. Fixes: a0e2cb6a9063 ("um: Add VFIO-based virtual PCI driver") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vfio_kern.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c index b51fc9888ae1..13b971a2bd43 100644 --- a/arch/um/drivers/vfio_kern.c +++ b/arch/um/drivers/vfio_kern.c @@ -570,6 +570,17 @@ static void uml_vfio_release_device(struct uml_vfio_device *dev) kfree(dev); } +static struct uml_vfio_device *uml_vfio_find_device(const char *device) +{ + struct uml_vfio_device *dev; + + list_for_each_entry(dev, ¨_vfio_devices, list) { + if (!strcmp(dev->name, device)) + return dev; + } + return NULL; +} + static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp) { struct uml_vfio_device *dev; @@ -582,6 +593,9 @@ static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *k uml_vfio_container.fd = fd; } + if (uml_vfio_find_device(device)) + return -EEXIST; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; From 8948941276024fcfb08fdb3d678867dc1f7b1c16 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:28 +0800 Subject: [PATCH 214/297] um: Use correct data source in fpregs_legacy_set() Read from the buffer pointed to by 'from' instead of '&buf', as 'buf' contains no valid data when 'ubuf' is NULL. Fixes: b1e1bd2e6943 ("um: Add helper functions to get/set state for SECCOMP") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-5-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/x86/um/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c index 3275870330fe..fae8aabad10f 100644 --- a/arch/x86/um/ptrace.c +++ b/arch/x86/um/ptrace.c @@ -161,7 +161,7 @@ static int fpregs_legacy_set(struct task_struct *target, from = kbuf; } - return um_fxsr_from_i387(fxsave, &buf); + return um_fxsr_from_i387(fxsave, from); } #endif From 2d65fc13be85c336c56af7077f08ccd3a3a15a4a Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 23 Jun 2025 19:08:29 +0800 Subject: [PATCH 215/297] um: vector: Reduce stack usage in vector_eth_configure() When compiling with clang (19.1.7), initializing *vp using a compound literal may result in excessive stack usage. Fix it by initializing the required fields of *vp individually. Without this patch: $ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0 ... 0x0000000000000540 vector_eth_configure [vector_kern.o]:1472 ... With this patch: $ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0 ... 0x0000000000000540 vector_eth_configure [vector_kern.o]:208 ... Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506221017.WtB7Usua-lkp@intel.com/ Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250623110829.314864-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 42 +++++++++++------------------------ 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index f292e0b4ff8b..9bbbddfe866b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1625,35 +1625,19 @@ static void vector_eth_configure( device->dev = dev; - *vp = ((struct vector_private) - { - .list = LIST_HEAD_INIT(vp->list), - .dev = dev, - .unit = n, - .options = get_transport_options(def), - .rx_irq = 0, - .tx_irq = 0, - .parsed = def, - .max_packet = get_mtu(def) + ETH_HEADER_OTHER, - /* TODO - we need to calculate headroom so that ip header - * is 16 byte aligned all the time - */ - .headroom = get_headroom(def), - .form_header = NULL, - .verify_header = NULL, - .header_rxbuffer = NULL, - .header_txbuffer = NULL, - .header_size = 0, - .rx_header_size = 0, - .rexmit_scheduled = false, - .opened = false, - .transport_data = NULL, - .in_write_poll = false, - .coalesce = 2, - .req_size = get_req_size(def), - .in_error = false, - .bpf = NULL - }); + INIT_LIST_HEAD(&vp->list); + vp->dev = dev; + vp->unit = n; + vp->options = get_transport_options(def); + vp->parsed = def; + vp->max_packet = get_mtu(def) + ETH_HEADER_OTHER; + /* + * TODO - we need to calculate headroom so that ip header + * is 16 byte aligned all the time + */ + vp->headroom = get_headroom(def); + vp->coalesce = 2; + vp->req_size = get_req_size(def); dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST); INIT_WORK(&vp->reset_tx, vector_reset_tx); From 3e0809b1664b9dc650d9dbca9a2d3ac690d4f661 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 24 Jun 2025 09:40:30 +0200 Subject: [PATCH 216/297] ata: ahci: Use correct DMI identifier for ASUSPRO-D840SA LPM quirk ASUS store the board name in DMI_PRODUCT_NAME rather than DMI_PRODUCT_VERSION. (Apparently it is only Lenovo that stores the model-name in DMI_PRODUCT_VERSION.) Use the correct DMI identifier, DMI_PRODUCT_NAME, to match the ASUSPRO-D840SA board, such that the quirk actually gets applied. Cc: stable@vger.kernel.org Reported-by: Andy Yang Tested-by: Andy Yang Closes: https://lore.kernel.org/linux-ide/aFb3wXAwJSSJUB7o@ryzen/ Fixes: b5acc3628898 ("ata: ahci: Disallow LPM for ASUSPRO-D840SA motherboard") Reviewed-by: Hans de Goede Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20250624074029.963028-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index e5e5c2e81d09..aa93b0ecbbc6 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1450,7 +1450,7 @@ static bool ahci_broken_lpm(struct pci_dev *pdev) { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_VERSION, "ASUSPRO D840MB_M840SA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ASUSPRO D840MB_M840SA"), }, /* 320 is broken, there is no known good version. */ }, From 7cac633a42a7b3c8146eb1db76fb80dc652998de Mon Sep 17 00:00:00 2001 From: Penglei Jiang Date: Wed, 25 Jun 2025 03:27:03 -0700 Subject: [PATCH 217/297] io_uring: fix resource leak in io_import_dmabuf() Replace the return statement with setting ret = -EINVAL and jumping to the err label to ensure resources are released via io_release_dmabuf. Fixes: a5c98e942457 ("io_uring/zcrx: dmabuf backed zerocopy receive") Signed-off-by: Penglei Jiang Link: https://lore.kernel.org/r/20250625102703.68336-1-superman.xpt@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 21c816c3bfe0..ade4da9c4e31 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -106,8 +106,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq, for_each_sgtable_dma_sg(mem->sgt, sg, i) total_size += sg_dma_len(sg); - if (total_size < off + len) - return -EINVAL; + if (total_size < off + len) { + ret = -EINVAL; + goto err; + } mem->dmabuf_offset = off; mem->size = len; From a3f3040657417aeadb9622c629d4a0c2693a0f93 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Thu, 29 May 2025 20:50:04 +0000 Subject: [PATCH 218/297] EDAC/amd64: Fix size calculation for Non-Power-of-Two DIMMs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each Chip-Select (CS) of a Unified Memory Controller (UMC) on AMD Zen-based SOCs has an Address Mask and a Secondary Address Mask register associated with it. The amd64_edac module logs DIMM sizes on a per-UMC per-CS granularity during init using these two registers. Currently, the module primarily considers only the Address Mask register for computing DIMM sizes. The Secondary Address Mask register is only considered for odd CS. Additionally, if it has been considered, the Address Mask register is ignored altogether for that CS. For power-of-two DIMMs i.e. DIMMs whose total capacity is a power of two (32GB, 64GB, etc), this is not an issue since only the Address Mask register is used. For non-power-of-two DIMMs i.e., DIMMs whose total capacity is not a power of two (48GB, 96GB, etc), however, the Secondary Address Mask register is used in conjunction with the Address Mask register. However, since the module only considers either of the two registers for a CS, the size computed by the module is incorrect. The Secondary Address Mask register is not considered for even CS, and the Address Mask register is not considered for odd CS. Introduce a new helper function so that both Address Mask and Secondary Address Mask registers are considered, when valid, for computing DIMM sizes. Furthermore, also rename some variables for greater clarity. Fixes: 81f5090db843 ("EDAC/amd64: Support asymmetric dual-rank DIMMs") Closes: https://lore.kernel.org/dbec22b6-00f2-498b-b70d-ab6f8a5ec87e@natrix.lt Reported-by: Žilvinas Žaltiena Signed-off-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Tested-by: Žilvinas Žaltiena Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250529205013.403450-1-avadhut.naik@amd.com --- drivers/edac/amd64_edac.c | 57 ++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index b681c0663203..07f1e9dc1ca7 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1209,7 +1209,9 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) if (csrow_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_PRIMARY; - /* Asymmetric dual-rank DIMM support. */ + if (csrow_sec_enabled(2 * dimm, ctrl, pvt)) + cs_mode |= CS_EVEN_SECONDARY; + if (csrow_sec_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_SECONDARY; @@ -1230,12 +1232,13 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) return cs_mode; } -static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode, - int csrow_nr, int dimm) +static int calculate_cs_size(u32 mask, unsigned int cs_mode) { - u32 msb, weight, num_zero_bits; - u32 addr_mask_deinterleaved; - int size = 0; + int msb, weight, num_zero_bits; + u32 deinterleaved_mask; + + if (!mask) + return 0; /* * The number of zero bits in the mask is equal to the number of bits @@ -1248,19 +1251,30 @@ static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode, * without swapping with the most significant bit. This can be handled * by keeping the MSB where it is and ignoring the single zero bit. */ - msb = fls(addr_mask_orig) - 1; - weight = hweight_long(addr_mask_orig); + msb = fls(mask) - 1; + weight = hweight_long(mask); num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE); /* Take the number of zero bits off from the top of the mask. */ - addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); + deinterleaved_mask = GENMASK(msb - num_zero_bits, 1); + edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", deinterleaved_mask); + + return (deinterleaved_mask >> 2) + 1; +} + +static int __addr_mask_to_cs_size(u32 addr_mask, u32 addr_mask_sec, + unsigned int cs_mode, int csrow_nr, int dimm) +{ + int size; edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm); - edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig); - edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved); + edac_dbg(1, " Primary AddrMask: 0x%x\n", addr_mask); /* Register [31:1] = Address [39:9]. Size is in kBs here. */ - size = (addr_mask_deinterleaved >> 2) + 1; + size = calculate_cs_size(addr_mask, cs_mode); + + edac_dbg(1, " Secondary AddrMask: 0x%x\n", addr_mask_sec); + size += calculate_cs_size(addr_mask_sec, cs_mode); /* Return size in MBs. */ return size >> 10; @@ -1269,8 +1283,8 @@ static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode, static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) { + u32 addr_mask = 0, addr_mask_sec = 0; int cs_mask_nr = csrow_nr; - u32 addr_mask_orig; int dimm, size = 0; /* No Chip Selects are enabled. */ @@ -1308,13 +1322,13 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, if (!pvt->flags.zn_regs_v2) cs_mask_nr >>= 1; - /* Asymmetric dual-rank DIMM support. */ - if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY)) - addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr]; - else - addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr]; + if (cs_mode & (CS_EVEN_PRIMARY | CS_ODD_PRIMARY)) + addr_mask = pvt->csels[umc].csmasks[cs_mask_nr]; - return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, dimm); + if (cs_mode & (CS_EVEN_SECONDARY | CS_ODD_SECONDARY)) + addr_mask_sec = pvt->csels[umc].csmasks_sec[cs_mask_nr]; + + return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, dimm); } static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) @@ -3512,9 +3526,10 @@ static void gpu_get_err_info(struct mce *m, struct err_info *err) static int gpu_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) { - u32 addr_mask_orig = pvt->csels[umc].csmasks[csrow_nr]; + u32 addr_mask = pvt->csels[umc].csmasks[csrow_nr]; + u32 addr_mask_sec = pvt->csels[umc].csmasks_sec[csrow_nr]; - return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, csrow_nr >> 1); + return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, csrow_nr >> 1); } static void gpu_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) From 55e8ff842051b1150461d7595d8f1d033c69d66b Mon Sep 17 00:00:00 2001 From: Jayesh Choudhary Date: Tue, 24 Jun 2025 10:18:35 +0530 Subject: [PATCH 219/297] drm/bridge: ti-sn65dsi86: Add HPD for DisplayPort connector type By default, HPD was disabled on SN65DSI86 bridge. When the driver was added (commit "a095f15c00e27"), the HPD_DISABLE bit was set in pre-enable call which was moved to other function calls subsequently. Later on, commit "c312b0df3b13" added detect utility for DP mode. But with HPD_DISABLE bit set, all the HPD events are disabled[0] and the debounced state always return 1 (always connected state). Set HPD_DISABLE bit conditionally based on display sink's connector type. Since the HPD_STATE is reflected correctly only after waiting for debounce time (~100-400ms) and adding this delay in detect() is not feasible owing to the performace impact (glitches and frame drop), remove runtime calls in detect() and add hpd_enable()/disable() bridge hooks with runtime calls, to detect hpd properly without any delay. [0]: (Pg. 32) Fixes: c312b0df3b13 ("drm/bridge: ti-sn65dsi86: Implement bridge connector operations for DP") Cc: Max Krummenacher Reviewed-by: Douglas Anderson Tested-by: Ernest Van Hoecke Signed-off-by: Jayesh Choudhary Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20250624044835.165708-1-j-choudhary@ti.com --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 69 +++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index 60224f476e1d..de9c23537465 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -348,12 +348,18 @@ static void ti_sn65dsi86_enable_comms(struct ti_sn65dsi86 *pdata, * 200 ms. We'll assume that the panel driver will have the hardcoded * delay in its prepare and always disable HPD. * - * If HPD somehow makes sense on some future panel we'll have to - * change this to be conditional on someone specifying that HPD should - * be used. + * For DisplayPort bridge type, we need HPD. So we use the bridge type + * to conditionally disable HPD. + * NOTE: The bridge type is set in ti_sn_bridge_probe() but enable_comms() + * can be called before. So for DisplayPort, HPD will be enabled once + * bridge type is set. We are using bridge type instead of "no-hpd" + * property because it is not used properly in devicetree description + * and hence is unreliable. */ - regmap_update_bits(pdata->regmap, SN_HPD_DISABLE_REG, HPD_DISABLE, - HPD_DISABLE); + + if (pdata->bridge.type != DRM_MODE_CONNECTOR_DisplayPort) + regmap_update_bits(pdata->regmap, SN_HPD_DISABLE_REG, HPD_DISABLE, + HPD_DISABLE); pdata->comms_enabled = true; @@ -1195,9 +1201,14 @@ static enum drm_connector_status ti_sn_bridge_detect(struct drm_bridge *bridge) struct ti_sn65dsi86 *pdata = bridge_to_ti_sn65dsi86(bridge); int val = 0; - pm_runtime_get_sync(pdata->dev); + /* + * Runtime reference is grabbed in ti_sn_bridge_hpd_enable() + * as the chip won't report HPD just after being powered on. + * HPD_DEBOUNCED_STATE reflects correct state only after the + * debounce time (~100-400 ms). + */ + regmap_read(pdata->regmap, SN_HPD_DISABLE_REG, &val); - pm_runtime_put_autosuspend(pdata->dev); return val & HPD_DEBOUNCED_STATE ? connector_status_connected : connector_status_disconnected; @@ -1220,6 +1231,26 @@ static void ti_sn65dsi86_debugfs_init(struct drm_bridge *bridge, struct dentry * debugfs_create_file("status", 0600, debugfs, pdata, &status_fops); } +static void ti_sn_bridge_hpd_enable(struct drm_bridge *bridge) +{ + struct ti_sn65dsi86 *pdata = bridge_to_ti_sn65dsi86(bridge); + + /* + * Device needs to be powered on before reading the HPD state + * for reliable hpd detection in ti_sn_bridge_detect() due to + * the high debounce time. + */ + + pm_runtime_get_sync(pdata->dev); +} + +static void ti_sn_bridge_hpd_disable(struct drm_bridge *bridge) +{ + struct ti_sn65dsi86 *pdata = bridge_to_ti_sn65dsi86(bridge); + + pm_runtime_put_autosuspend(pdata->dev); +} + static const struct drm_bridge_funcs ti_sn_bridge_funcs = { .attach = ti_sn_bridge_attach, .detach = ti_sn_bridge_detach, @@ -1234,6 +1265,8 @@ static const struct drm_bridge_funcs ti_sn_bridge_funcs = { .atomic_duplicate_state = drm_atomic_helper_bridge_duplicate_state, .atomic_destroy_state = drm_atomic_helper_bridge_destroy_state, .debugfs_init = ti_sn65dsi86_debugfs_init, + .hpd_enable = ti_sn_bridge_hpd_enable, + .hpd_disable = ti_sn_bridge_hpd_disable, }; static void ti_sn_bridge_parse_lanes(struct ti_sn65dsi86 *pdata, @@ -1321,8 +1354,26 @@ static int ti_sn_bridge_probe(struct auxiliary_device *adev, pdata->bridge.type = pdata->next_bridge->type == DRM_MODE_CONNECTOR_DisplayPort ? DRM_MODE_CONNECTOR_DisplayPort : DRM_MODE_CONNECTOR_eDP; - if (pdata->bridge.type == DRM_MODE_CONNECTOR_DisplayPort) - pdata->bridge.ops = DRM_BRIDGE_OP_EDID | DRM_BRIDGE_OP_DETECT; + if (pdata->bridge.type == DRM_MODE_CONNECTOR_DisplayPort) { + pdata->bridge.ops = DRM_BRIDGE_OP_EDID | DRM_BRIDGE_OP_DETECT | + DRM_BRIDGE_OP_HPD; + /* + * If comms were already enabled they would have been enabled + * with the wrong value of HPD_DISABLE. Update it now. Comms + * could be enabled if anyone is holding a pm_runtime reference + * (like if a GPIO is in use). Note that in most cases nobody + * is doing AUX channel xfers before the bridge is added so + * HPD doesn't _really_ matter then. The only exception is in + * the eDP case where the panel wants to read the EDID before + * the bridge is added. We always consistently have HPD disabled + * for eDP. + */ + mutex_lock(&pdata->comms_mutex); + if (pdata->comms_enabled) + regmap_update_bits(pdata->regmap, SN_HPD_DISABLE_REG, + HPD_DISABLE, 0); + mutex_unlock(&pdata->comms_mutex); + }; drm_bridge_add(&pdata->bridge); From 1944f6ab4967db7ad8d4db527dceae8c77de76e9 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 25 Jun 2025 10:16:38 +0200 Subject: [PATCH 220/297] smb: client: let smbd_post_send_iter() respect the peers max_send_size and transmit all data We should not send smbdirect_data_transfer messages larger than the negotiated max_send_size, typically 1364 bytes, which means 24 bytes of the smbdirect_data_transfer header + 1340 payload bytes. This happened when doing an SMB2 write with more than 1340 bytes (which is done inline as it's below rdma_readwrite_threshold). It means the peer resets the connection. When testing between cifs.ko and ksmbd.ko something like this is logged: client: CIFS: VFS: RDMA transport re-established siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 siw: got TERMINATE. layer 1, type 2, code 2 CIFS: VFS: \\carina Send error in SessSetup = -11 smb2_reconnect: 12 callbacks suppressed CIFS: VFS: reconnect tcon failed rc = -11 CIFS: VFS: reconnect tcon failed rc = -11 CIFS: VFS: reconnect tcon failed rc = -11 CIFS: VFS: SMB: Zero rsize calculated, using minimum value 65536 and: CIFS: VFS: RDMA transport re-established siw: got TERMINATE. layer 1, type 2, code 2 CIFS: VFS: smbd_recv:1894 disconnected siw: got TERMINATE. layer 1, type 2, code 2 The ksmbd dmesg is showing things like: smb_direct: Recv error. status='local length error (1)' opcode=128 smb_direct: disconnected smb_direct: Recv error. status='local length error (1)' opcode=128 ksmbd: smb_direct: disconnected ksmbd: sock_read failed: -107 As smbd_post_send_iter() limits the transmitted number of bytes we need loop over it in order to transmit the whole iter. Reviewed-by: David Howells Tested-by: David Howells Tested-by: Meetakshi Setiya Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: # sp->max_send_size should be info->max_send_size in backports Fixes: 3d78fe73fa12 ("cifs: Build the RDMA SGE list directly from an iterator") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index cbc85bca006f..a976bcf61226 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -907,8 +907,10 @@ static int smbd_post_send_iter(struct smbd_connection *info, .local_dma_lkey = sc->ib.pd->local_dma_lkey, .direction = DMA_TO_DEVICE, }; + size_t payload_len = umin(*_remaining_data_length, + sp->max_send_size - sizeof(*packet)); - rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length, + rc = smb_extract_iter_to_rdma(iter, payload_len, &extract); if (rc < 0) goto err_dma; @@ -1013,6 +1015,27 @@ static int smbd_post_send_empty(struct smbd_connection *info) return smbd_post_send_iter(info, NULL, &remaining_data_length); } +static int smbd_post_send_full_iter(struct smbd_connection *info, + struct iov_iter *iter, + int *_remaining_data_length) +{ + int rc = 0; + + /* + * smbd_post_send_iter() respects the + * negotiated max_send_size, so we need to + * loop until the full iter is posted + */ + + while (iov_iter_count(iter) > 0) { + rc = smbd_post_send_iter(info, iter, _remaining_data_length); + if (rc < 0) + break; + } + + return rc; +} + /* * Post a receive request to the transport * The remote peer can only send data when a receive request is posted @@ -2032,14 +2055,14 @@ int smbd_send(struct TCP_Server_Info *server, klen += rqst->rq_iov[i].iov_len; iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); - rc = smbd_post_send_iter(info, &iter, &remaining_data_length); + rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length); if (rc < 0) break; if (iov_iter_count(&rqst->rq_iter) > 0) { /* And then the data pages if there are any */ - rc = smbd_post_send_iter(info, &rqst->rq_iter, - &remaining_data_length); + rc = smbd_post_send_full_iter(info, &rqst->rq_iter, + &remaining_data_length); if (rc < 0) break; } From 9a709b7e98e6fa51600b5f2d24c5068efa6d39de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Jun 2025 10:17:06 -0600 Subject: [PATCH 221/297] io_uring/net: mark iov as dynamically allocated even for single segments A bigger array of vecs could've been allocated, but io_ring_buffers_peek() still decided to cap the mapped range depending on how much data was available. Hence don't rely on the segment count to know if the request should be marked as needing cleanup, always check upfront if the iov array is different than the fast_iov array. Fixes: 26ec15e4b0c1 ("io_uring/kbuf: don't truncate end buffer for multiple buffer peeks") Signed-off-by: Jens Axboe --- io_uring/net.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 9550d4c8f866..5c1e8c4ba468 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1077,6 +1077,12 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg if (unlikely(ret < 0)) return ret; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + /* special case 1 vec, can be a fast path */ if (ret == 1) { sr->buf = arg.iovs[0].iov_base; @@ -1085,11 +1091,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { - kmsg->vec.nr = ret; - kmsg->vec.iovec = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } } else { void __user *buf; From e97f9540ce001503a4539f337da742c1dfa7d86a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 25 Jun 2025 10:13:04 +0200 Subject: [PATCH 222/297] smb: client: remove \t from TP_printk statements The generate '[FAILED TO PARSE]' strings in trace-cmd report output like this: rm-5298 [001] 6084.533748493: smb3_exit_err: [FAILED TO PARSE] xid=972 func_name=cifs_rmdir rc=-39 rm-5298 [001] 6084.533959234: smb3_enter: [FAILED TO PARSE] xid=973 func_name=cifs_closedir rm-5298 [001] 6084.533967630: smb3_close_enter: [FAILED TO PARSE] xid=973 fid=94489281833 tid=1 sesid=96758029877361 rm-5298 [001] 6084.534004008: smb3_cmd_enter: [FAILED TO PARSE] tid=1 sesid=96758029877361 cmd=6 mid=566 rm-5298 [001] 6084.552248232: smb3_cmd_done: [FAILED TO PARSE] tid=1 sesid=96758029877361 cmd=6 mid=566 rm-5298 [001] 6084.552280542: smb3_close_done: [FAILED TO PARSE] xid=973 fid=94489281833 tid=1 sesid=96758029877361 rm-5298 [001] 6084.552316034: smb3_exit_done: [FAILED TO PARSE] xid=973 func_name=cifs_closedir Cc: stable@vger.kernel.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/trace.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 52bcb55d9952..93e5b2bb9f28 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -140,7 +140,7 @@ DECLARE_EVENT_CLASS(smb3_rw_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\tR=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + TP_printk("R=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", __entry->rreq_debug_id, __entry->rreq_debug_index, __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->offset, __entry->len, __entry->rc) @@ -190,7 +190,7 @@ DECLARE_EVENT_CLASS(smb3_other_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->offset, __entry->len, __entry->rc) ) @@ -247,7 +247,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc) ) @@ -298,7 +298,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_done_class, __entry->target_offset = target_offset; __entry->len = len; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", + TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len) ) @@ -482,7 +482,7 @@ DECLARE_EVENT_CLASS(smb3_fd_class, __entry->tid = tid; __entry->sesid = sesid; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx", __entry->xid, __entry->sesid, __entry->tid, __entry->fid) ) @@ -521,7 +521,7 @@ DECLARE_EVENT_CLASS(smb3_fd_err_class, __entry->sesid = sesid; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->rc) ) @@ -794,7 +794,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_err_class, __entry->status = status; __entry->rc = rc; ), - TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", + TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", __entry->sesid, __entry->tid, __entry->cmd, __entry->mid, __entry->status, __entry->rc) ) @@ -829,7 +829,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_done_class, __entry->cmd = cmd; __entry->mid = mid; ), - TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu", + TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu", __entry->sesid, __entry->tid, __entry->cmd, __entry->mid) ) @@ -867,7 +867,7 @@ DECLARE_EVENT_CLASS(smb3_mid_class, __entry->when_sent = when_sent; __entry->when_received = when_received; ), - TP_printk("\tcmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu", + TP_printk("cmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu", __entry->cmd, __entry->mid, __entry->pid, __entry->when_sent, __entry->when_received) ) @@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, __assign_str(func_name); __entry->rc = rc; ), - TP_printk("\t%s: xid=%u rc=%d", + TP_printk("%s: xid=%u rc=%d", __get_str(func_name), __entry->xid, __entry->rc) ) @@ -924,7 +924,7 @@ DECLARE_EVENT_CLASS(smb3_sync_err_class, __entry->ino = ino; __entry->rc = rc; ), - TP_printk("\tino=%lu rc=%d", + TP_printk("ino=%lu rc=%d", __entry->ino, __entry->rc) ) @@ -950,7 +950,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, __entry->xid = xid; __assign_str(func_name); ), - TP_printk("\t%s: xid=%u", + TP_printk("%s: xid=%u", __get_str(func_name), __entry->xid) ) From 0a46f60a9fe16f5596b6b4b3ee1a483ea7854136 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 20 Jun 2025 13:29:24 +0800 Subject: [PATCH 223/297] cxl/edac: Fix using wrong repair type to check dram event record cxl_find_rec_dram() is used to find a DRAM event record based on the inputted attributes. Different repair_type of the inputted attributes will check the DRAM event record in different ways. When EDAC driver is performing a memory rank sparing, it should use CXL_RANK_SPARING rather than CXL_BANK_SPARING as repair_type for DRAM event record checking. Fixes: 588ca944c277 ("cxl/edac: Add CXL memory device memory sparing control feature") Signed-off-by: Li Ming Reviewed-by: Shiju Jose Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Fan Ni Link: https://patch.msgid.link/20250620052924.138892-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c index d725ee954199..623aaa4439c4 100644 --- a/drivers/cxl/core/edac.c +++ b/drivers/cxl/core/edac.c @@ -1323,7 +1323,7 @@ cxl_mem_get_rec_dram(struct cxl_memdev *cxlmd, attrbs.bank = ctx->bank; break; case EDAC_REPAIR_RANK_SPARING: - attrbs.repair_type = CXL_BANK_SPARING; + attrbs.repair_type = CXL_RANK_SPARING; break; default: return NULL; From fa6f092cc0a02d0fcee37e9e8172eda372a03d33 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 24 Jun 2025 22:02:15 -0700 Subject: [PATCH 224/297] libbpf: Fix possible use-after-free for externs The `name` field in `obj->externs` points into the BTF data at initial open time. However, some functions may invalidate this after opening and before loading (e.g. `bpf_map__set_value_size`), which results in pointers into freed memory and undefined behavior. The simplest solution is to simply `strdup` these strings, similar to the `essent_name`, and free them at the same time. In order to test this path, the `global_map_resize` BPF selftest is modified slightly to ensure the presence of an extern, which causes this test to fail prior to the fix. Given there isn't an obvious API or error to test against, I opted to add this to the existing test as an aspect of the resizing feature rather than duplicate the test. Fixes: 9d0a23313b1a ("libbpf: Add capability for resizing datasec maps") Signed-off-by: Adin Scannell Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250625050215.2777374-1-amscanne@meta.com --- tools/lib/bpf/libbpf.c | 10 +++++++--- .../selftests/bpf/progs/test_global_map_resize.c | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e9c641a2fb20..52e353368f58 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -597,7 +597,7 @@ struct extern_desc { int sym_idx; int btf_id; int sec_btf_id; - const char *name; + char *name; char *essent_name; bool is_set; bool is_weak; @@ -4259,7 +4259,9 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return ext->btf_id; } t = btf__type_by_id(obj->btf, ext->btf_id); - ext->name = btf__name_by_offset(obj->btf, t->name_off); + ext->name = strdup(btf__name_by_offset(obj->btf, t->name_off)); + if (!ext->name) + return -ENOMEM; ext->sym_idx = i; ext->is_weak = ELF64_ST_BIND(sym->st_info) == STB_WEAK; @@ -9138,8 +9140,10 @@ void bpf_object__close(struct bpf_object *obj) zfree(&obj->btf_custom_path); zfree(&obj->kconfig); - for (i = 0; i < obj->nr_extern; i++) + for (i = 0; i < obj->nr_extern; i++) { + zfree(&obj->externs[i].name); zfree(&obj->externs[i].essent_name); + } zfree(&obj->externs); obj->nr_extern = 0; diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c index a3f220ba7025..ee65bad0436d 100644 --- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c +++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c @@ -32,6 +32,16 @@ int my_int_last SEC(".data.array_not_last"); int percpu_arr[1] SEC(".data.percpu_arr"); +/* at least one extern is included, to ensure that a specific + * regression is tested whereby resizing resulted in a free-after-use + * bug after type information is invalidated by the resize operation. + * + * There isn't a particularly good API to test for this specific condition, + * but by having externs for the resizing tests it will cover this path. + */ +extern int LINUX_KERNEL_VERSION __kconfig; +long version_sink; + SEC("tp/syscalls/sys_enter_getpid") int bss_array_sum(void *ctx) { @@ -44,6 +54,9 @@ int bss_array_sum(void *ctx) for (size_t i = 0; i < bss_array_len; ++i) sum += array[i]; + /* see above; ensure this is not optimized out */ + version_sink = LINUX_KERNEL_VERSION; + return 0; } @@ -59,6 +72,9 @@ int data_array_sum(void *ctx) for (size_t i = 0; i < data_array_len; ++i) sum += my_array[i]; + /* see above; ensure this is not optimized out */ + version_sink = LINUX_KERNEL_VERSION; + return 0; } From a5d0b9e32745277644cda8d7d334e7080bd339bf Mon Sep 17 00:00:00 2001 From: Lukasz Kucharczyk Date: Tue, 20 May 2025 14:22:52 +0200 Subject: [PATCH 225/297] i2c: imx: fix emulated smbus block read Acknowledge the byte count submitted by the target. When I2C_SMBUS_BLOCK_DATA read operation is executed by i2c_smbus_xfer_emulated(), the length of the second (read) message is set to 1. Length of the block is supposed to be obtained from the target by the underlying bus driver. The i2c_imx_isr_read() function should emit the acknowledge on i2c bus after reading the first byte (i.e., byte count) while processing such message (as defined in Section 6.5.7 of System Management Bus Specification [1]). Without this acknowledge, the target does not submit subsequent bytes and the controller only reads 0xff's. In addition, store the length of block data obtained from the target in the buffer provided by i2c_smbus_xfer_emulated() - otherwise the first byte of actual data is erroneously interpreted as length of the data block. [1] https://smbus.org/specs/SMBus_3_3_20240512.pdf Fixes: 5f5c2d4579ca ("i2c: imx: prevent rescheduling in non dma mode") Signed-off-by: Lukasz Kucharczyk Cc: # v6.13+ Acked-by: Oleksij Rempel Reviewed-by: Stefan Eichenberger Reviewed-by: Carlos Song Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250520122252.1475403-1-lukasz.kucharczyk@leica-geosystems.com --- drivers/i2c/busses/i2c-imx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index e5732b0557fb..205cc132fdec 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -1008,7 +1008,7 @@ static inline int i2c_imx_isr_read(struct imx_i2c_struct *i2c_imx) /* setup bus to read data */ temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); temp &= ~I2CR_MTX; - if (i2c_imx->msg->len - 1) + if ((i2c_imx->msg->len - 1) || (i2c_imx->msg->flags & I2C_M_RECV_LEN)) temp &= ~I2CR_TXAK; imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); @@ -1063,6 +1063,7 @@ static inline void i2c_imx_isr_read_block_data_len(struct imx_i2c_struct *i2c_im wake_up(&i2c_imx->queue); } i2c_imx->msg->len += len; + i2c_imx->msg->buf[i2c_imx->msg_buf_idx++] = len; } static irqreturn_t i2c_imx_master_isr(struct imx_i2c_struct *i2c_imx, unsigned int status) From 56ad91c1aa9c18064348edf69308080b03c9dc48 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 22 May 2025 08:42:35 +0200 Subject: [PATCH 226/297] i2c: robotfuzz-osif: disable zero-length read messages This driver passes the length of an i2c_msg directly to usb_control_msg(). If the message is now a read and of length 0, it violates the USB protocol and a warning will be printed. Enable the I2C_AQ_NO_ZERO_LEN_READ quirk for this adapter thus forbidding 0-length read messages altogether. Fixes: 83e53a8f120f ("i2c: Add bus driver for for OSIF USB i2c device.") Signed-off-by: Wolfram Sang Cc: # v3.14+ Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250522064234.3721-2-wsa+renesas@sang-engineering.com --- drivers/i2c/busses/i2c-robotfuzz-osif.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/i2c/busses/i2c-robotfuzz-osif.c b/drivers/i2c/busses/i2c-robotfuzz-osif.c index 80d45079b763..e0a76fb5bc31 100644 --- a/drivers/i2c/busses/i2c-robotfuzz-osif.c +++ b/drivers/i2c/busses/i2c-robotfuzz-osif.c @@ -111,6 +111,11 @@ static u32 osif_func(struct i2c_adapter *adapter) return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; } +/* prevent invalid 0-length usb_control_msg */ +static const struct i2c_adapter_quirks osif_quirks = { + .flags = I2C_AQ_NO_ZERO_LEN_READ, +}; + static const struct i2c_algorithm osif_algorithm = { .xfer = osif_xfer, .functionality = osif_func, @@ -143,6 +148,7 @@ static int osif_probe(struct usb_interface *interface, priv->adapter.owner = THIS_MODULE; priv->adapter.class = I2C_CLASS_HWMON; + priv->adapter.quirks = &osif_quirks; priv->adapter.algo = &osif_algorithm; priv->adapter.algo_data = priv; snprintf(priv->adapter.name, sizeof(priv->adapter.name), From cbdb25ccf7566eee0c2b945e35cb98baf9ed0aa6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 22 May 2025 08:43:49 +0200 Subject: [PATCH 227/297] i2c: tiny-usb: disable zero-length read messages This driver passes the length of an i2c_msg directly to usb_control_msg(). If the message is now a read and of length 0, it violates the USB protocol and a warning will be printed. Enable the I2C_AQ_NO_ZERO_LEN_READ quirk for this adapter thus forbidding 0-length read messages altogether. Fixes: e8c76eed2ecd ("i2c: New i2c-tiny-usb bus driver") Signed-off-by: Wolfram Sang Cc: # v2.6.22+ Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250522064349.3823-2-wsa+renesas@sang-engineering.com --- drivers/i2c/busses/i2c-tiny-usb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/i2c/busses/i2c-tiny-usb.c b/drivers/i2c/busses/i2c-tiny-usb.c index a18eab0992a1..57dfe5f1a7d9 100644 --- a/drivers/i2c/busses/i2c-tiny-usb.c +++ b/drivers/i2c/busses/i2c-tiny-usb.c @@ -139,6 +139,11 @@ static u32 usb_func(struct i2c_adapter *adapter) return ret; } +/* prevent invalid 0-length usb_control_msg */ +static const struct i2c_adapter_quirks usb_quirks = { + .flags = I2C_AQ_NO_ZERO_LEN_READ, +}; + /* This is the actual algorithm we define */ static const struct i2c_algorithm usb_algorithm = { .xfer = usb_xfer, @@ -247,6 +252,7 @@ static int i2c_tiny_usb_probe(struct usb_interface *interface, /* setup i2c adapter description */ dev->adapter.owner = THIS_MODULE; dev->adapter.class = I2C_CLASS_HWMON; + dev->adapter.quirks = &usb_quirks; dev->adapter.algo = &usb_algorithm; dev->adapter.algo_data = dev; snprintf(dev->adapter.name, sizeof(dev->adapter.name), From 942e1aece13e4007656e41d3182c0adf05cf08b8 Mon Sep 17 00:00:00 2001 From: Pratap Nirujogi Date: Mon, 9 Jun 2025 11:53:55 -0400 Subject: [PATCH 228/297] i2c: designware: Initialize adapter name only when not set Check if the adapter name is already set in the driver prior to initializing with generic name in i2c_dw_probe_master(). This check allows to retain the unique adapter name driver has initialized, which platform driver can use to distinguish it from other i2c designware adapters. Tested-by: Randy Dunlap Signed-off-by: Pratap Nirujogi Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250609155601.1477055-2-pratap.nirujogi@amd.com --- drivers/i2c/busses/i2c-designware-master.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index c5394229b77f..9d7d9e47564a 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -1042,8 +1042,9 @@ int i2c_dw_probe_master(struct dw_i2c_dev *dev) if (ret) return ret; - snprintf(adap->name, sizeof(adap->name), - "Synopsys DesignWare I2C adapter"); + if (!adap->name[0]) + scnprintf(adap->name, sizeof(adap->name), + "Synopsys DesignWare I2C adapter"); adap->retries = 3; adap->algo = &i2c_dw_algo; adap->quirks = &i2c_dw_quirks; From c8dc579169738a3546f57ecb38e62d3872a3cc04 Mon Sep 17 00:00:00 2001 From: Pratap Nirujogi Date: Mon, 9 Jun 2025 11:53:56 -0400 Subject: [PATCH 229/297] i2c: amd-isp: Initialize unique adapter name Initialize unique name for amdisp i2c adapter, which is used in the platform driver to detect the matching adapter for i2c_client creation. Add definition of amdisp i2c adapter name in a new header file (include/linux/soc/amd/isp4_misc.h) as it is referred in different driver modules. Tested-by: Randy Dunlap Signed-off-by: Pratap Nirujogi Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250609155601.1477055-3-pratap.nirujogi@amd.com --- MAINTAINERS | 1 + drivers/i2c/busses/i2c-designware-amdisp.c | 2 ++ include/linux/soc/amd/isp4_misc.h | 12 ++++++++++++ 3 files changed, 15 insertions(+) create mode 100644 include/linux/soc/amd/isp4_misc.h diff --git a/MAINTAINERS b/MAINTAINERS index c3f7fbd0d67a..8719f097aae3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24063,6 +24063,7 @@ M: Bin Du L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-designware-amdisp.c +F: include/linux/soc/amd/isp4_misc.h SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER M: Jaehoon Chung diff --git a/drivers/i2c/busses/i2c-designware-amdisp.c b/drivers/i2c/busses/i2c-designware-amdisp.c index ad6f08338124..450793d5f839 100644 --- a/drivers/i2c/busses/i2c-designware-amdisp.c +++ b/drivers/i2c/busses/i2c-designware-amdisp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "i2c-designware-core.h" @@ -62,6 +63,7 @@ static int amd_isp_dw_i2c_plat_probe(struct platform_device *pdev) adap = &isp_i2c_dev->adapter; adap->owner = THIS_MODULE; + scnprintf(adap->name, sizeof(adap->name), AMDISP_I2C_ADAP_NAME); ACPI_COMPANION_SET(&adap->dev, ACPI_COMPANION(&pdev->dev)); adap->dev.of_node = pdev->dev.of_node; /* use dynamically allocated adapter id */ diff --git a/include/linux/soc/amd/isp4_misc.h b/include/linux/soc/amd/isp4_misc.h new file mode 100644 index 000000000000..6738796986a7 --- /dev/null +++ b/include/linux/soc/amd/isp4_misc.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#ifndef __SOC_ISP4_MISC_H +#define __SOC_ISP4_MISC_H + +#define AMDISP_I2C_ADAP_NAME "AMDISP DesignWare I2C adapter" + +#endif From 577c1e0ef351e41aba764816232a9feb7a9b3969 Mon Sep 17 00:00:00 2001 From: Pratap Nirujogi Date: Mon, 9 Jun 2025 11:53:57 -0400 Subject: [PATCH 230/297] platform/x86: Use i2c adapter name to fix build errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use adapater->name inplace of adapter->owner->name to fix build issues when CONFIG_MODULES is not defined. Fixes: 90b85567e457 ("platform/x86: Add AMD ISP platform config for OV05C10") Reported-by: Randy Dunlap Link: https://lore.kernel.org/all/04577a46-9add-420c-b181-29bad582026d@infradead.org Tested-by: Randy Dunlap Signed-off-by: Pratap Nirujogi Requires: 942e1aece13e ("i2c: designware: Initialize adapter name only when not set" Requires: c8dc57916973 ("i2c: amd-isp: Initialize unique adapter name") Acked-by: Ilpo Järvinen Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250609155601.1477055-4-pratap.nirujogi@amd.com --- drivers/platform/x86/amd/amd_isp4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/amd_isp4.c b/drivers/platform/x86/amd/amd_isp4.c index 0cc01441bcbb..9f291aeb35f1 100644 --- a/drivers/platform/x86/amd/amd_isp4.c +++ b/drivers/platform/x86/amd/amd_isp4.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -151,7 +152,7 @@ MODULE_DEVICE_TABLE(acpi, amdisp_sensor_ids); static inline bool is_isp_i2c_adapter(struct i2c_adapter *adap) { - return !strcmp(adap->owner->name, "i2c_designware_amdisp"); + return !strcmp(adap->name, AMDISP_I2C_ADAP_NAME); } static void instantiate_isp_i2c_client(struct amdisp_platform *isp4_platform, From 666c23af755dccca8c25b5d5200ca28153c69a05 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 14 Jun 2025 16:59:26 +0200 Subject: [PATCH 231/297] i2c: omap: Fix an error handling path in omap_i2c_probe() If an error occurs after calling mux_state_select(), mux_state_deselect() should be called as already done in the remove function. Fixes: b6ef830c60b6 ("i2c: omap: Add support for setting mux") Signed-off-by: Christophe JAILLET Cc: # v6.15+ Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/998542981b6d2435c057dd8b9fe71743927babab.1749913149.git.christophe.jaillet@wanadoo.fr --- drivers/i2c/busses/i2c-omap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index f1cc26ac5b80..8b01df3cc8e9 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -1461,13 +1461,13 @@ omap_i2c_probe(struct platform_device *pdev) if (IS_ERR(mux_state)) { r = PTR_ERR(mux_state); dev_dbg(&pdev->dev, "failed to get I2C mux: %d\n", r); - goto err_disable_pm; + goto err_put_pm; } omap->mux_state = mux_state; r = mux_state_select(omap->mux_state); if (r) { dev_err(&pdev->dev, "failed to select I2C mux: %d\n", r); - goto err_disable_pm; + goto err_put_pm; } } @@ -1515,6 +1515,9 @@ omap_i2c_probe(struct platform_device *pdev) err_unuse_clocks: omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, 0); + if (omap->mux_state) + mux_state_deselect(omap->mux_state); +err_put_pm: pm_runtime_dont_use_autosuspend(omap->dev); pm_runtime_put_sync(omap->dev); err_disable_pm: From 5e9388f7984a9cc7e659a105113f6ccf0aebedd0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 25 Jun 2025 17:03:55 -0400 Subject: [PATCH 232/297] selftests/bpf: adapt one more case in test_lru_map to the new target_free The below commit that updated BPF_MAP_TYPE_LRU_HASH free target, also updated tools/testing/selftests/bpf/test_lru_map to match. But that missed one case that passes with 4 cores, but fails at higher cpu counts. Update test_lru_sanity3 to also adjust its expectation of target_free. This time tested with 1, 4, 16, 64 and 384 cpu count. Fixes: d4adf1c9ee77 ("bpf: Adjust free target to avoid global starvation of LRU map") Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20250625210412.2732970-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_lru_map.c | 33 ++++++++++++---------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 4ae83f4b7fc7..0921939532c6 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try) return ret; } +/* Derive target_free from map_size, same as bpf_common_lru_populate */ +static unsigned int __tgt_size(unsigned int map_size) +{ + return (map_size / nr_cpus) / 2; +} + /* Inverse of how bpf_common_lru_populate derives target_free from map_size. */ static unsigned int __map_size(unsigned int tgt_free) { @@ -410,12 +416,12 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) printf("Pass\n"); } -/* Size of the LRU map is 2*tgt_free - * It is to test the active/inactive list rotation - * Insert 1 to 2*tgt_free (+2*tgt_free keys) - * Lookup key 1 to tgt_free*3/2 - * Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys) - * => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU +/* Test the active/inactive list rotation + * + * Fill the whole map, deplete the free list. + * Reference all except the last lru->target_free elements. + * Insert lru->target_free new elements. This triggers one shrink. + * Verify that the non-referenced elements are replaced. */ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free) { @@ -434,8 +440,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free) assert(sched_next_online(0, &next_cpu) != -1); - batch_size = tgt_free / 2; - assert(batch_size * 2 == tgt_free); + batch_size = __tgt_size(tgt_free); map_size = tgt_free * 2; lru_map_fd = create_map(map_type, map_flags, map_size); @@ -446,23 +451,21 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to 2*tgt_free (+2*tgt_free keys) */ - end_key = 1 + (2 * tgt_free); + /* Fill the map */ + end_key = 1 + map_size; for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Lookup key 1 to tgt_free*3/2 */ - end_key = tgt_free + batch_size; + /* Reference all but the last batch_size */ + end_key = 1 + map_size - batch_size; for (key = 1; key < end_key; key++) { assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value)); assert(!bpf_map_update_elem(expected_map_fd, &key, value, BPF_NOEXIST)); } - /* Add 1+2*tgt_free to tgt_free*5/2 - * (+tgt_free/2 keys) - */ + /* Insert new batch_size: replaces the non-referenced elements */ key = 2 * tgt_free + 1; end_key = key + batch_size; for (; key < end_key; key++) { From c4890963350dcf4e9a909bae23665921fba4ad27 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Tue, 24 Jun 2025 08:41:47 +0200 Subject: [PATCH 233/297] atm: idt77252: Add missing `dma_map_error()` The DMA map functions can fail and should be tested for errors. Signed-off-by: Thomas Fourier Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250624064148.12815-3-fourier.thomas@gmail.com Signed-off-by: Jakub Kicinski --- drivers/atm/idt77252.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 1206ab764ba9..f2e91b7d79f0 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -852,6 +852,8 @@ queue_skb(struct idt77252_dev *card, struct vc_map *vc, IDT77252_PRV_PADDR(skb) = dma_map_single(&card->pcidev->dev, skb->data, skb->len, DMA_TO_DEVICE); + if (dma_mapping_error(&card->pcidev->dev, IDT77252_PRV_PADDR(skb))) + return -ENOMEM; error = -EINVAL; @@ -1857,6 +1859,8 @@ add_rx_skb(struct idt77252_dev *card, int queue, paddr = dma_map_single(&card->pcidev->dev, skb->data, skb_end_pointer(skb) - skb->data, DMA_FROM_DEVICE); + if (dma_mapping_error(&card->pcidev->dev, paddr)) + goto outpoolrm; IDT77252_PRV_PADDR(skb) = paddr; if (push_rx_skb(card, skb, queue)) { @@ -1871,6 +1875,7 @@ add_rx_skb(struct idt77252_dev *card, int queue, dma_unmap_single(&card->pcidev->dev, IDT77252_PRV_PADDR(skb), skb_end_pointer(skb) - skb->data, DMA_FROM_DEVICE); +outpoolrm: handle = IDT77252_PRV_POOL(skb); card->sbpool[POOL_QUEUE(handle)].skb[POOL_INDEX(handle)] = NULL; From 7b515f35a911fdc31fbde6531828dcd6ae9803d3 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 24 Jun 2025 17:35:12 +0100 Subject: [PATCH 234/297] net: enetc: Correct endianness handling in _enetc_rd_reg64 enetc_hw.h provides two versions of _enetc_rd_reg64. One which simply calls ioread64() when available. And another that composes the 64-bit result from ioread32() calls. In the second case the code appears to assume that each ioread32() call returns a little-endian value. However both the shift and logical or used to compose the return value would not work correctly on big endian systems if this were the case. Moreover, this is inconsistent with the first case where the return value of ioread64() is assumed to be in host byte order. It appears that the correct approach is for both versions to treat the return value of ioread*() functions as being in host byte order. And this patch corrects the ioread32()-based version to do so. This is a bug but would only manifest on big endian systems that make use of the ioread32-based implementation of _enetc_rd_reg64. While all in-tree users of this driver are little endian and make use of the ioread64-based implementation of _enetc_rd_reg64. Thus, no in-tree user of this driver is affected by this bug. Flagged by Sparse. Compile tested only. Fixes: 16eb4c85c964 ("enetc: Add ethtool statistics") Closes: https://lore.kernel.org/all/AM9PR04MB850500D3FC24FE23DEFCEA158879A@AM9PR04MB8505.eurprd04.prod.outlook.com/ Signed-off-by: Simon Horman Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250624-etnetc-le-v1-1-a73a95d96e4e@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc_hw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 4098f01479bc..53e8d18c7a34 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -507,7 +507,7 @@ static inline u64 _enetc_rd_reg64(void __iomem *reg) tmp = ioread32(reg + 4); } while (high != tmp); - return le64_to_cpu((__le64)high << 32 | low); + return (u64)high << 32 | low; } #endif From 2434ccb94dfc8e036c81e9d7c77fca6dc1dc2590 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:53 -0700 Subject: [PATCH 235/297] netlink: specs: nfsd: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 13727f85b49b ("NFSD: introduce netlink stubs") Reviewed-by: Donald Hunter Reviewed-by: Jeff Layton Link: https://patch.msgid.link/20250624211002.3475021-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/nfsd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index c87658114852..8d1a3c01708f 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -27,7 +27,7 @@ attribute-sets: name: proc type: u32 - - name: service_time + name: service-time type: s64 - name: pad @@ -139,7 +139,7 @@ operations: - prog - version - proc - - service_time + - service-time - saddr4 - daddr4 - saddr6 From 791a9ed0a40dfa70fe793dcecf950273255cbb84 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:54 -0700 Subject: [PATCH 236/297] netlink: specs: fou: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 4eb77b4ecd3c ("netlink: add a proto specification for FOU") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/fou.yaml | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Documentation/netlink/specs/fou.yaml b/Documentation/netlink/specs/fou.yaml index 0af5ab842c04..b02ab19817d3 100644 --- a/Documentation/netlink/specs/fou.yaml +++ b/Documentation/netlink/specs/fou.yaml @@ -15,7 +15,7 @@ kernel-policy: global definitions: - type: enum - name: encap_type + name: encap-type name-prefix: fou-encap- enum-name: entries: [ unspec, direct, gue ] @@ -43,26 +43,26 @@ attribute-sets: name: type type: u8 - - name: remcsum_nopartial + name: remcsum-nopartial type: flag - - name: local_v4 + name: local-v4 type: u32 - - name: local_v6 + name: local-v6 type: binary checks: min-len: 16 - - name: peer_v4 + name: peer-v4 type: u32 - - name: peer_v6 + name: peer-v6 type: binary checks: min-len: 16 - - name: peer_port + name: peer-port type: u16 byte-order: big-endian - @@ -90,12 +90,12 @@ operations: - port - ipproto - type - - remcsum_nopartial - - local_v4 - - peer_v4 - - local_v6 - - peer_v6 - - peer_port + - remcsum-nopartial + - local-v4 + - peer-v4 + - local-v6 + - peer-v6 + - peer-port - ifindex - @@ -112,11 +112,11 @@ operations: - af - ifindex - port - - peer_port - - local_v4 - - peer_v4 - - local_v6 - - peer_v6 + - peer-port + - local-v4 + - peer-v4 + - local-v6 + - peer-v6 - name: get From 07caaf875c937e5f0a262a1dbad307d08ecc8673 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:55 -0700 Subject: [PATCH 237/297] netlink: specs: ethtool: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen replaces special chars in names) but gives more uniform naming in Python. Fixes: 13e59344fb9d ("net: ethtool: add support for symmetric-xor RSS hash") Fixes: 46fb3ba95b93 ("ethtool: Add an interface for flashing transceiver modules' firmware") Reviewed-by: Kory Maincent Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 6 +++--- tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 72a076b0e1b5..348c6ad548f5 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -48,7 +48,7 @@ definitions: name: started doc: The firmware flashing process has started. - - name: in_progress + name: in-progress doc: The firmware flashing process is in progress. - name: completed @@ -1422,7 +1422,7 @@ attribute-sets: name: hkey type: binary - - name: input_xfrm + name: input-xfrm type: u32 - name: start-context @@ -2238,7 +2238,7 @@ operations: - hfunc - indir - hkey - - input_xfrm + - input-xfrm dump: request: attributes: diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py index f439c434ba36..648ff50bc1c3 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -38,7 +38,7 @@ def test_rss_input_xfrm(cfg, ipver): raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") input_xfrm = cfg.ethnl.rss_get( - {'header': {'dev-name': cfg.ifname}}).get('input_xfrm') + {'header': {'dev-name': cfg.ifname}}).get('input-xfrm') # Check for symmetric xor/or-xor if not input_xfrm or (input_xfrm != 1 and input_xfrm != 2): From 354592f19c7b0ac5983d56d8594a76aa3437f0a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:56 -0700 Subject: [PATCH 238/297] netlink: specs: dpll: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 3badff3a25d8 ("dpll: spec: Add Netlink spec in YAML") Reviewed-by: Vadim Fedorenko Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/dpll.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 8feefeae5376..f434140b538e 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -188,7 +188,7 @@ definitions: value: 10000 - type: const - name: pin-frequency-77_5-khz + name: pin-frequency-77-5-khz value: 77500 - type: const From 9407680945145cc34fafd0e9f802b03574620d43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:57 -0700 Subject: [PATCH 239/297] netlink: specs: devlink: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 429ac6211494 ("devlink: define enum for attr types of dynamic attributes") Fixes: f2f9dd164db0 ("netlink: specs: devlink: add the remaining command to generate complete split_ops") Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 05fee1b7fe19..38ddc04f9e6d 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -38,15 +38,15 @@ definitions: - name: dsa - - name: pci_pf + name: pci-pf - - name: pci_vf + name: pci-vf - name: virtual - name: unused - - name: pci_sf + name: pci-sf - type: enum name: port-fn-state @@ -220,7 +220,7 @@ definitions: - name: flag - - name: nul_string + name: nul-string value: 10 - name: binary From e40d3d0931d2b1e86ba2ca0e9c95f90ac6dd5525 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:58 -0700 Subject: [PATCH 240/297] netlink: specs: ovs_flow: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 93b230b549bc ("netlink: specs: add ynl spec for ovs_flow") Reviewed-by: Donald Hunter Reviewed-by: Ilya Maximets Reviewed-by: Eelco Chaudron Link: https://patch.msgid.link/20250624211002.3475021-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ovs_flow.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/ovs_flow.yaml b/Documentation/netlink/specs/ovs_flow.yaml index 46f5d1cd8a5f..7974aa7d8905 100644 --- a/Documentation/netlink/specs/ovs_flow.yaml +++ b/Documentation/netlink/specs/ovs_flow.yaml @@ -216,7 +216,7 @@ definitions: type: struct members: - - name: nd_target + name: nd-target type: binary len: 16 byte-order: big-endian @@ -258,12 +258,12 @@ definitions: type: struct members: - - name: vlan_tpid + name: vlan-tpid type: u16 byte-order: big-endian doc: Tag protocol identifier (TPID) to push. - - name: vlan_tci + name: vlan-tci type: u16 byte-order: big-endian doc: Tag control identifier (TCI) to push. From 9e6dd4c256d0774701637b958ba682eff4991277 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:59 -0700 Subject: [PATCH 241/297] netlink: specs: mptcp: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp") Reviewed-by: Davide Caratti Reviewed-by: Donald Hunter Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250624211002.3475021-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 8 ++++---- include/uapi/linux/mptcp_pm.h | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dfd017780d2f..fb57860fe778 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -57,21 +57,21 @@ definitions: doc: >- A new subflow has been established. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: sub-closed doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: sub-priority value: 13 doc: >- The priority of a subflow has changed. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: listener-created value: 15 @@ -255,7 +255,7 @@ attribute-sets: name: timeout type: u32 - - name: if_idx + name: if-idx type: u32 - name: reset-reason diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 84fa8a21dfd0..6ac84b2f636c 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -27,14 +27,14 @@ * token, rem_id. * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of * sk_err) could be set if an error has been detected for this subflow. * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - * daddr6, sport, dport, backup, if_idx [, error]. + * daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: * family, sport, saddr4 | saddr6. * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, From 8d7e211ea925e050882c762ddf8cf2da78856dfb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:10:00 -0700 Subject: [PATCH 242/297] netlink: specs: rt-link: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: b2f63d904e72 ("doc/netlink: Add spec for rt link messages") Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index b41b31eebcae..28c4cf66517c 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -603,7 +603,7 @@ definitions: name: optmask type: u32 - - name: if_stats_msg + name: if-stats-msg type: struct members: - @@ -2486,7 +2486,7 @@ operations: name: getstats doc: Get / dump link stats. attribute-set: stats-attrs - fixed-header: if_stats_msg + fixed-header: if-stats-msg do: request: value: 94 From eef0eaeca7fa8e358a31e89802f564451b797718 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:10:01 -0700 Subject: [PATCH 243/297] netlink: specs: tc: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: a1bcfde83669 ("doc/netlink/specs: Add a spec for tc") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/tc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml index cb7ea7d62e56..42d74c9aeb54 100644 --- a/Documentation/netlink/specs/tc.yaml +++ b/Documentation/netlink/specs/tc.yaml @@ -232,7 +232,7 @@ definitions: type: u8 doc: log(P_max / (qth-max - qth-min)) - - name: Scell_log + name: Scell-log type: u8 doc: cell size for idle damping - @@ -253,7 +253,7 @@ definitions: name: DPs type: u32 - - name: def_DP + name: def-DP type: u32 - name: grio From af852f1f1c951d43b36881302fd10d9f898cdb54 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:10:02 -0700 Subject: [PATCH 244/297] netlink: specs: enforce strict naming of properties Add a regexp to make sure all names which may end up being visible to the user consist of lower case characters, numbers and dashes. Underscores keep sneaking into the specs, which is not visible in the C code but makes the Python and alike inconsistent. Note that starting with a number is okay, as in C the full name will include the family name. For legacy families we can't enforce the naming in the family name or the multicast group names, as these are part of the binary uAPI of the kernel. For classic netlink we need to allow capital letters in names of struct members. TC has some structs with capitalized members. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/genetlink-legacy.yaml | 15 +++++++++------ Documentation/netlink/genetlink.yaml | 17 ++++++++++------- Documentation/netlink/netlink-raw.yaml | 18 ++++++++++++------ 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 4cbfe666e6f5..b29d62eefa16 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -6,6 +6,9 @@ $schema: https://json-schema.org/draft-07/schema # Common defines $defs: + name: + type: string + pattern: ^[0-9a-z-]+$ uint: type: integer minimum: 0 @@ -76,7 +79,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' header: description: For C-compatible languages, header which already defines this value. type: string @@ -103,7 +106,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' value: type: integer doc: @@ -132,7 +135,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: description: The netlink attribute type enum: [ u8, u16, u32, u64, s8, s16, s32, s64, string, binary ] @@ -169,7 +172,7 @@ properties: name: description: | Name used when referring to this space in other definitions, not used outside of the spec. - type: string + $ref: '#/$defs/name' name-prefix: description: | Prefix for the C enum name of the attributes. Default family[name]-set[name]-a- @@ -206,7 +209,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: &attr-type description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, @@ -348,7 +351,7 @@ properties: properties: name: description: Name of the operation, also defining its C enum value in uAPI. - type: string + $ref: '#/$defs/name' doc: description: Documentation for the command. type: string diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml index 40efbbad76ab..7b1ec153e834 100644 --- a/Documentation/netlink/genetlink.yaml +++ b/Documentation/netlink/genetlink.yaml @@ -6,6 +6,9 @@ $schema: https://json-schema.org/draft-07/schema # Common defines $defs: + name: + type: string + pattern: ^[0-9a-z-]+$ uint: type: integer minimum: 0 @@ -29,7 +32,7 @@ additionalProperties: False properties: name: description: Name of the genetlink family. - type: string + $ref: '#/$defs/name' doc: type: string protocol: @@ -48,7 +51,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' header: description: For C-compatible languages, header which already defines this value. type: string @@ -75,7 +78,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' value: type: integer doc: @@ -96,7 +99,7 @@ properties: name: description: | Name used when referring to this space in other definitions, not used outside of the spec. - type: string + $ref: '#/$defs/name' name-prefix: description: | Prefix for the C enum name of the attributes. Default family[name]-set[name]-a- @@ -121,7 +124,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: &attr-type enum: [ unused, pad, flag, binary, uint, sint, u8, u16, u32, u64, s8, s16, s32, s64, @@ -243,7 +246,7 @@ properties: properties: name: description: Name of the operation, also defining its C enum value in uAPI. - type: string + $ref: '#/$defs/name' doc: description: Documentation for the command. type: string @@ -327,7 +330,7 @@ properties: name: description: | The name for the group, used to form the define and the value of the define. - type: string + $ref: '#/$defs/name' flags: *cmd_flags kernel-family: diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index e34bf23897fa..246fa07bccf6 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -6,6 +6,12 @@ $schema: https://json-schema.org/draft-07/schema # Common defines $defs: + name: + type: string + pattern: ^[0-9a-z-]+$ + name-cap: + type: string + pattern: ^[0-9a-zA-Z-]+$ uint: type: integer minimum: 0 @@ -71,7 +77,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' header: description: For C-compatible languages, header which already defines this value. type: string @@ -98,7 +104,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' value: type: integer doc: @@ -124,7 +130,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name-cap' type: description: | The netlink attribute type. Members of type 'binary' or 'pad' @@ -183,7 +189,7 @@ properties: name: description: | Name used when referring to this space in other definitions, not used outside of the spec. - type: string + $ref: '#/$defs/name' name-prefix: description: | Prefix for the C enum name of the attributes. Default family[name]-set[name]-a- @@ -220,7 +226,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: &attr-type description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, @@ -408,7 +414,7 @@ properties: properties: name: description: Name of the operation, also defining its C enum value in uAPI. - type: string + $ref: '#/$defs/name' doc: description: Documentation for the command. type: string From 4a5e85f4eb8fd18b1266342d100e4f0849544ca0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 17 Jun 2025 16:35:32 +0200 Subject: [PATCH 245/297] fs/proc/task_mmu: fix PAGE_IS_PFNZERO detection for the huge zero folio is_zero_pfn() does not work for the huge zero folio. Fix it by using is_huge_zero_pmd(). This can cause the PAGEMAP_SCAN ioctl against /proc/pid/pagemap to present pages as PAGE_IS_PRESENT rather than as PAGE_IS_PFNZERO. Found by code inspection. Link: https://lkml.kernel.org/r/20250617143532.2375383-1-david@redhat.com Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs") Signed-off-by: David Hildenbrand Cc: Muhammad Usama Anjum Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 27972c0749e7..4be91eb6ea5c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2182,7 +2182,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_FILE; } - if (is_zero_pfn(pmd_pfn(pmd))) + if (is_huge_zero_pmd(pmd)) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; From 86bc5752a9173589271439911b33b4d952b0a91f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 17 Jun 2025 10:58:19 +0200 Subject: [PATCH 246/297] mm: add OOM killer maintainer structure Add MAINTAINERS info for the oom-killer. [akpm@linux-foundation.org: fix mhocko email address (SeongJae), add files (Lorenzo)] [akpm@linux-foundation.org: fix ordering] Link: https://lkml.kernel.org/r/20250617085819.355838-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: David Rientjes Acked-by: Shakeel Butt Acked-by: Lorenzo Stoakes Acked-by: SeongJae Park Acked-by: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index efb51ee92683..689b9a20b6df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15848,6 +15848,17 @@ F: mm/numa.c F: mm/numa_emulation.c F: mm/numa_memblks.c +MEMORY MANAGEMENT - OOM KILLER +M: Michal Hocko +R: David Rientjes +R: Shakeel Butt +L: linux-mm@kvack.org +S: Maintained +F: include/linux/oom.h +F: include/trace/events/oom.h +F: include/uapi/linux/oom.h +F: mm/oom_kill.c + MEMORY MANAGEMENT - PAGE ALLOCATOR M: Andrew Morton M: Vlastimil Babka From 71aa17fd981be2ef683bf83326128c505abe070c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 18 Jun 2025 19:47:52 +0300 Subject: [PATCH 247/297] MAINTAINERS: add tree entry to mm init block Link: https://lkml.kernel.org/r/aFLubPfiO5hqfhCe@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 689b9a20b6df..fc6cc0b2e9a3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15676,6 +15676,8 @@ MEMBLOCK AND MEMORY MANAGEMENT INITIALIZATION M: Mike Rapoport L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git for-next +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git fixes F: Documentation/core-api/boot-time-mm.rst F: Documentation/core-api/kho/bindings/memblock/* F: include/linux/memblock.h From 3746351e876b42f5221c2f85ba60965b38494ff2 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 18 Jun 2025 11:59:53 +0100 Subject: [PATCH 248/297] MAINTAINERS: add missing files to mm page alloc section There are a number of files within memory management which appear to be most suitably placed within the page allocation section of MAINTAINERS and are otherwise unassigned, so place these there. Link: https://lkml.kernel.org/r/20250618105953.67630-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Brendan Jackman Acked-by: Zi Yan Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fc6cc0b2e9a3..4e131bb97894 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15875,8 +15875,17 @@ F: include/linux/compaction.h F: include/linux/gfp.h F: include/linux/page-isolation.h F: mm/compaction.c +F: mm/debug_page_alloc.c +F: mm/fail_page_alloc.c F: mm/page_alloc.c +F: mm/page_ext.c +F: mm/page_frag_cache.c F: mm/page_isolation.c +F: mm/page_owner.c +F: mm/page_poison.c +F: mm/page_reporting.c +F: mm/show_mem.c +F: mm/shuffle.c MEMORY MANAGEMENT - RECLAIM M: Andrew Morton From 344ef45b03336e7f74658814f66483b5417c9cf1 Mon Sep 17 00:00:00 2001 From: Ge Yang Date: Tue, 27 May 2025 11:36:50 +0800 Subject: [PATCH 249/297] mm/hugetlb: remove unnecessary holding of hugetlb_lock In isolate_or_dissolve_huge_folio(), after acquiring the hugetlb_lock, it is only for the purpose of obtaining the correct hstate, which is then passed to alloc_and_dissolve_hugetlb_folio(). alloc_and_dissolve_hugetlb_folio() itself also acquires the hugetlb_lock. We can have alloc_and_dissolve_hugetlb_folio() obtain the hstate by itself, so that isolate_or_dissolve_huge_folio() no longer needs to acquire the hugetlb_lock. In addition, we keep the folio_test_hugetlb() check within isolate_or_dissolve_huge_folio(). By doing so, we can avoid disrupting the normal path by vainly holding the hugetlb_lock. replace_free_hugepage_folios() has the same issue, and we should address it as well. Addresses a possible performance problem which was added by the hotfix 113ed54ad276 ("mm/hugetlb: fix kernel NULL pointer dereference when replacing free hugetlb folios"). Link: https://lkml.kernel.org/r/1748317010-16272-1-git-send-email-yangge1116@126.com Fixes: 113ed54ad276 ("mm/hugetlb: fix kernel NULL pointer dereference when replacing free hugetlb folios") Signed-off-by: Ge Yang Suggested-by: Oscar Salvador Reviewed-by: Muchun Song Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 54 +++++++++++++++++----------------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8746ed2fec13..9dc95eac558c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2787,20 +2787,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one - * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, - struct folio *old_folio, struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio, + struct list_head *list) { - gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + gfp_t gfp_mask; + struct hstate *h; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: + /* + * The old_folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + */ spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* @@ -2829,8 +2833,10 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, cond_resched(); goto retry; } else { + h = folio_hstate(old_folio); if (!new_folio) { spin_unlock_irq(&hugetlb_lock); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) @@ -2874,35 +2880,24 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { - struct hstate *h; int ret = -EBUSY; - /* - * The page might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. - * Return success when racing as if we dissolved the page ourselves. - */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (!folio_test_hugetlb(folio)) return 0; - } - spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (hstate_is_gigantic(h)) + if (folio_order(folio) > MAX_PAGE_ORDER) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) - ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); + ret = alloc_and_dissolve_hugetlb_folio(folio, list); return ret; } @@ -2916,7 +2911,6 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { - struct hstate *h; struct folio *folio; int ret = 0; @@ -2925,23 +2919,9 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); - /* - * The folio might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. - */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); - start_pfn++; - continue; - } - spin_unlock_irq(&hugetlb_lock); - - if (!folio_ref_count(folio)) { - ret = alloc_and_dissolve_hugetlb_folio(h, folio, - &isolate_list); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list); if (ret) break; From df831e97739405ecbaddb85516bc7d4d1c933d6b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 19 Jun 2025 21:26:55 +0800 Subject: [PATCH 250/297] lib/group_cpus: fix NULL pointer dereference from group_cpus_evenly() While testing null_blk with configfs, echo 0 > poll_queues will trigger following panic: BUG: kernel NULL pointer dereference, address: 0000000000000010 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 27 UID: 0 PID: 920 Comm: bash Not tainted 6.15.0-02023-gadbdb95c8696-dirty #1238 PREEMPT(undef) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014 RIP: 0010:__bitmap_or+0x48/0x70 Call Trace: __group_cpus_evenly+0x822/0x8c0 group_cpus_evenly+0x2d9/0x490 blk_mq_map_queues+0x1e/0x110 null_map_queues+0xc9/0x170 [null_blk] blk_mq_update_queue_map+0xdb/0x160 blk_mq_update_nr_hw_queues+0x22b/0x560 nullb_update_nr_hw_queues+0x71/0xf0 [null_blk] nullb_device_poll_queues_store+0xa4/0x130 [null_blk] configfs_write_iter+0x109/0x1d0 vfs_write+0x26e/0x6f0 ksys_write+0x79/0x180 __x64_sys_write+0x1d/0x30 x64_sys_call+0x45c4/0x45f0 do_syscall_64+0xa5/0x240 entry_SYSCALL_64_after_hwframe+0x76/0x7e Root cause is that numgrps is set to 0, and ZERO_SIZE_PTR is returned from kcalloc(), and later ZERO_SIZE_PTR will be deferenced. Fix the problem by checking numgrps first in group_cpus_evenly(), and return NULL directly if numgrps is zero. [yukuai3@huawei.com: also fix the non-SMP version] Link: https://lkml.kernel.org/r/20250620010958.1265984-1-yukuai1@huaweicloud.com Link: https://lkml.kernel.org/r/20250619132655.3318883-1-yukuai1@huaweicloud.com Fixes: 6a6dcae8f486 ("blk-mq: Build default queue map via group_cpus_evenly()") Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Jens Axboe Cc: ErKun Yang Cc: John Garry Cc: Thomas Gleinxer Cc: "zhangyi (F)" Cc: Signed-off-by: Andrew Morton --- lib/group_cpus.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/group_cpus.c b/lib/group_cpus.c index ee272c4cefcc..18d43a406114 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -352,6 +352,9 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) int ret = -ENOMEM; struct cpumask *masks = NULL; + if (numgrps == 0) + return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -426,8 +429,12 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) #else /* CONFIG_SMP */ struct cpumask *group_cpus_evenly(unsigned int numgrps) { - struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + struct cpumask *masks; + if (numgrps == 0) + return NULL; + + masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); if (!masks) return NULL; From f5769359c5b241978e6933672bb78b3adc36aa18 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Fri, 20 Jun 2025 02:31:54 +0800 Subject: [PATCH 251/297] mm/alloc_tag: fix the kmemleak false positive issue in the allocation of the percpu variable tag->counters When loading a module, as long as the module has memory allocation operations, kmemleak produces a false positive report that resembles the following: unreferenced object (percpu) 0x7dfd232a1650 (size 16): comm "modprobe", pid 1301, jiffies 4294940249 hex dump (first 16 bytes on cpu 2): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 0): kmemleak_alloc_percpu+0xb4/0xd0 pcpu_alloc_noprof+0x700/0x1098 load_module+0xd4/0x348 codetag_module_init+0x20c/0x450 codetag_load_module+0x70/0xb8 load_module+0xef8/0x1608 init_module_from_file+0xec/0x158 idempotent_init_module+0x354/0x608 __arm64_sys_finit_module+0xbc/0x150 invoke_syscall+0xd4/0x258 el0_svc_common.constprop.0+0xb4/0x240 do_el0_svc+0x48/0x68 el0_svc+0x40/0xf8 el0t_64_sync_handler+0x10c/0x138 el0t_64_sync+0x1ac/0x1b0 This is because the module can only indirectly reference alloc_tag_counters through the alloc_tag section, which misleads kmemleak. However, we don't have a kmemleak ignore interface for percpu allocations yet. So let's create one and invoke it for tag->counters. [gehao@kylinos.cn: fix build error when CONFIG_DEBUG_KMEMLEAK=n, s/igonore/ignore/] Link: https://lkml.kernel.org/r/20250620093102.2416767-1-hao.ge@linux.dev Link: https://lkml.kernel.org/r/20250619183154.2122608-1-hao.ge@linux.dev Fixes: 12ca42c23775 ("alloc_tag: allocate percpu counters for module tags dynamically") Signed-off-by: Hao Ge Reviewed-by: Catalin Marinas Acked-by: Suren Baghdasaryan [lib/alloc_tag.c] Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/kmemleak.h | 4 ++++ lib/alloc_tag.c | 8 +++++++- mm/kmemleak.c | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 93a73c076d16..fbd424b2abb1 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -28,6 +28,7 @@ extern void kmemleak_update_trace(const void *ptr) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_transient_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; +extern void kmemleak_ignore_percpu(const void __percpu *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, @@ -97,6 +98,9 @@ static inline void kmemleak_not_leak(const void *ptr) static inline void kmemleak_transient_leak(const void *ptr) { } +static inline void kmemleak_ignore_percpu(const void __percpu *ptr) +{ +} static inline void kmemleak_ignore(const void *ptr) { } diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index d48b80f3f007..3a74d63a959e 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -10,6 +10,7 @@ #include #include #include +#include #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) @@ -632,8 +633,13 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag mod->name); return -ENOMEM; } - } + /* + * Avoid a kmemleak false positive. The pointer to the counters is stored + * in the alloc_tag section of the module and cannot be directly accessed. + */ + kmemleak_ignore_percpu(tag->counters); + } return 0; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index da9cee34ee1b..8d588e685311 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1246,6 +1246,20 @@ void __ref kmemleak_transient_leak(const void *ptr) } EXPORT_SYMBOL(kmemleak_transient_leak); +/** + * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu + * address argument + * @ptr: percpu address of the object + */ +void __ref kmemleak_ignore_percpu(const void __percpu *ptr) +{ + pr_debug("%s(0x%px)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + make_black_object((unsigned long)ptr, OBJECT_PERCPU); +} +EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu); + /** * kmemleak_ignore - ignore an allocated object * @ptr: pointer to beginning of the object From 4f489fe6afb395dbc79840efa3c05440b760d883 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jun 2025 11:36:07 -0700 Subject: [PATCH 252/297] mm/damon/sysfs-schemes: free old damon_sysfs_scheme_filter->memcg_path on write memcg_path_store() assigns a newly allocated memory buffer to filter->memcg_path, without deallocating the previously allocated and assigned memory buffer. As a result, users can leak kernel memory by continuously writing a data to memcg_path DAMOS sysfs file. Fix the leak by deallocating the previously set memory buffer. Link: https://lkml.kernel.org/r/20250619183608.6647-2-sj@kernel.org Fixes: 7ee161f18b5d ("mm/damon/sysfs-schemes: implement filter directory") Signed-off-by: SeongJae Park Cc: Shuah Khan Cc: [6.3.x] Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 0f6c9e1fec0b..30ae7518ffbf 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -472,6 +472,7 @@ static ssize_t memcg_path_store(struct kobject *kobj, return -ENOMEM; strscpy(path, buf, count + 1); + kfree(filter->memcg_path); filter->memcg_path = path; return count; } From 79300ac805b672a84b64d80d4cbc374d83411599 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 19 Jun 2025 15:51:05 -0700 Subject: [PATCH 253/297] scripts/gdb: fix dentry_name() lookup The "d_iname" member was replaced with "d_shortname.string" in the commit referenced in the Fixes tag. This prevented the GDB script "lx-mount" command to properly function: (gdb) lx-mounts mount super_block devname pathname fstype options 0xff11000002d21180 0xff11000002d24800 rootfs / rootfs rw 0 0 0xff11000002e18a80 0xff11000003713000 /dev/root / ext4 rw,relatime 0 0 Python Exception : There is no member named d_iname. Error occurred in Python: There is no member named d_iname. Link: https://lkml.kernel.org/r/20250619225105.320729-1-florian.fainelli@broadcom.com Fixes: 58cf9c383c5c ("dcache: back inline names with a struct-wrapped array of unsigned long") Signed-off-by: Florian Fainelli Cc: Al Viro Cc: Jan Kara Cc: Jan Kiszka Cc: Jeff Layton Cc: Kieran Bingham Cc: Signed-off-by: Andrew Morton --- scripts/gdb/linux/vfs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/vfs.py b/scripts/gdb/linux/vfs.py index c77b9ce75f6d..b5fbb18ccb77 100644 --- a/scripts/gdb/linux/vfs.py +++ b/scripts/gdb/linux/vfs.py @@ -22,7 +22,7 @@ def dentry_name(d): if parent == d or parent == 0: return "" p = dentry_name(d['d_parent']) + "/" - return p + d['d_iname'].string() + return p + d['d_shortname']['string'].string() class DentryName(gdb.Function): """Return string of the full path of a dentry. From befd9a71d859ea625eaa84dae1b243efb3df3eca Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Sun, 22 Jun 2025 01:13:51 +0800 Subject: [PATCH 254/297] fuse: fix runtime warning on truncate_folio_batch_exceptionals() The WARN_ON_ONCE is introduced on truncate_folio_batch_exceptionals() to capture whether the filesystem has removed all DAX entries or not. And the fix has been applied on the filesystem xfs and ext4 by the commit 0e2f80afcfa6 ("fs/dax: ensure all pages are idle prior to filesystem unmount"). Apply the missed fix on filesystem fuse to fix the runtime warning: [ 2.011450] ------------[ cut here ]------------ [ 2.011873] WARNING: CPU: 0 PID: 145 at mm/truncate.c:89 truncate_folio_batch_exceptionals+0x272/0x2b0 [ 2.012468] Modules linked in: [ 2.012718] CPU: 0 UID: 1000 PID: 145 Comm: weston Not tainted 6.16.0-rc2-WSL2-STABLE #2 PREEMPT(undef) [ 2.013292] RIP: 0010:truncate_folio_batch_exceptionals+0x272/0x2b0 [ 2.013704] Code: 48 63 d0 41 29 c5 48 8d 1c d5 00 00 00 00 4e 8d 6c 2a 01 49 c1 e5 03 eb 09 48 83 c3 08 49 39 dd 74 83 41 f6 44 1c 08 01 74 ef <0f> 0b 49 8b 34 1e 48 89 ef e8 10 a2 17 00 eb df 48 8b 7d 00 e8 35 [ 2.014845] RSP: 0018:ffffa47ec33f3b10 EFLAGS: 00010202 [ 2.015279] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 2.015884] RDX: 0000000000000000 RSI: ffffa47ec33f3ca0 RDI: ffff98aa44f3fa80 [ 2.016377] RBP: ffff98aa44f3fbf0 R08: ffffa47ec33f3ba8 R09: 0000000000000000 [ 2.016942] R10: 0000000000000001 R11: 0000000000000000 R12: ffffa47ec33f3ca0 [ 2.017437] R13: 0000000000000008 R14: ffffa47ec33f3ba8 R15: 0000000000000000 [ 2.017972] FS: 000079ce006afa40(0000) GS:ffff98aade441000(0000) knlGS:0000000000000000 [ 2.018510] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2.018987] CR2: 000079ce03e74000 CR3: 000000010784f006 CR4: 0000000000372eb0 [ 2.019518] Call Trace: [ 2.019729] [ 2.019901] truncate_inode_pages_range+0xd8/0x400 [ 2.020280] ? timerqueue_add+0x66/0xb0 [ 2.020574] ? get_nohz_timer_target+0x2a/0x140 [ 2.020904] ? timerqueue_add+0x66/0xb0 [ 2.021231] ? timerqueue_del+0x2e/0x50 [ 2.021646] ? __remove_hrtimer+0x39/0x90 [ 2.022017] ? srso_alias_untrain_ret+0x1/0x10 [ 2.022497] ? psi_group_change+0x136/0x350 [ 2.023046] ? _raw_spin_unlock+0xe/0x30 [ 2.023514] ? finish_task_switch.isra.0+0x8d/0x280 [ 2.024068] ? __schedule+0x532/0xbd0 [ 2.024551] fuse_evict_inode+0x29/0x190 [ 2.025131] evict+0x100/0x270 [ 2.025641] ? _atomic_dec_and_lock+0x39/0x50 [ 2.026316] ? __pfx_generic_delete_inode+0x10/0x10 [ 2.026843] __dentry_kill+0x71/0x180 [ 2.027335] dput+0xeb/0x1b0 [ 2.027725] __fput+0x136/0x2b0 [ 2.028054] __x64_sys_close+0x3d/0x80 [ 2.028469] do_syscall_64+0x6d/0x1b0 [ 2.028832] ? clear_bhb_loop+0x30/0x80 [ 2.029182] ? clear_bhb_loop+0x30/0x80 [ 2.029533] ? clear_bhb_loop+0x30/0x80 [ 2.029902] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 2.030423] RIP: 0033:0x79ce03d0d067 [ 2.030820] Code: b8 ff ff ff ff e9 3e ff ff ff 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 c3 a7 f8 ff [ 2.032354] RSP: 002b:00007ffef0498948 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ 2.032939] RAX: ffffffffffffffda RBX: 00007ffef0498960 RCX: 000079ce03d0d067 [ 2.033612] RDX: 0000000000000003 RSI: 0000000000001000 RDI: 000000000000000d [ 2.034289] RBP: 00007ffef0498a30 R08: 000000000000000d R09: 0000000000000000 [ 2.034944] R10: 00007ffef0498978 R11: 0000000000000246 R12: 0000000000000001 [ 2.035610] R13: 00007ffef0498960 R14: 000079ce03e09ce0 R15: 0000000000000003 [ 2.036301] [ 2.036532] ---[ end trace 0000000000000000 ]--- Link: https://lkml.kernel.org/r/20250621171507.3770-1-haiyuewa@163.com Fixes: bde708f1a65d ("fs/dax: always remove DAX page-cache entries when breaking layouts") Signed-off-by: Haiyue Wang Cc: Alistair Popple Cc: Dan Williams Cc: Miklos Szeredi Cc: Signed-off-by: Andrew Morton --- fs/fuse/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bfe8d8af46f3..9572bdef49ee 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -9,6 +9,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" +#include #include #include #include @@ -162,6 +163,9 @@ static void fuse_evict_inode(struct inode *inode) /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); + if (FUSE_IS_DAX(inode)) + dax_break_layout_final(inode); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { From b160a5cc6ac761f341e58e48ae3f03a04d310677 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 20 Jun 2025 19:53:53 +0800 Subject: [PATCH 255/297] mailmap: add entries for Zijun Hu Map my old qualcomm email addresses: Zijun Hu Zijun Hu To the current one: Zijun Hu Link: https://lkml.kernel.org/r/20250620-my_mailmap-v1-1-11ea3db8ba1e@oss.qualcomm.com Signed-off-by: Zijun Hu Cc: Hans verkuil Signed-off-by: Andrew Morton --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index d57531dab08b..85f13b941f08 100644 --- a/.mailmap +++ b/.mailmap @@ -830,3 +830,5 @@ Yosry Ahmed Yusuke Goda Zack Rusin Zhu Yanjun +Zijun Hu +Zijun Hu From c9e8efa0b384c43670852f955567b31a5eb4ac2b Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 20 Jun 2025 19:53:54 +0800 Subject: [PATCH 256/297] mailmap: correct name for a historical account of Zijun Hu Correct the name for from 'zijun_hu' to 'Zijun Hu'. Link: https://lkml.kernel.org/r/20250620-my_mailmap-v1-2-11ea3db8ba1e@oss.qualcomm.com Signed-off-by: Zijun Hu Cc: Hans verkuil Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 85f13b941f08..a5534ff9c1d7 100644 --- a/.mailmap +++ b/.mailmap @@ -832,3 +832,4 @@ Zack Rusin Zhu Yanjun Zijun Hu Zijun Hu +Zijun Hu From b6f5e748587063d4ad3294d07ca29886851d9cd7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:21:22 +0200 Subject: [PATCH 257/297] crashdump: add CONFIG_KEYS dependency The dm_crypt code fails to build without CONFIG_KEYS: kernel/crash_dump_dm_crypt.c: In function 'restore_dm_crypt_keys_to_thread_keyring': kernel/crash_dump_dm_crypt.c:105:9: error: unknown type name 'key_ref_t'; did you mean 'key_ref_put'? There is a mix of 'select KEYS' and 'depends on KEYS' in Kconfig, so there is no single obvious solution here, but generally using 'depends on' makes more sense and is less likely to cause dependency loops. Link: https://lkml.kernel.org/r/20250620112140.3396316-1-arnd@kernel.org Fixes: 62f17d9df692 ("crash_dump: retrieve dm crypt keys in kdump kernel") Signed-off-by: Arnd Bergmann Cc: Alexander Graf Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Michael Ellerman Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index e64ce21f9a80..2ee603a98813 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -134,6 +134,7 @@ config CRASH_DM_CRYPT depends on KEXEC_FILE depends on CRASH_DUMP depends on DM_CRYPT + depends on KEYS help With this option enabled, user space can intereact with /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys From 7c942f87cc0be5699b1ce434d369eccd8b5321d4 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 20 Jun 2025 16:41:50 +0530 Subject: [PATCH 258/297] selftests/mm: fix validate_addr() helper validate_addr() checks whether the address returned by mmap() lies in the low or high VA space, according to whether a high addr hint was passed or not. The fix commit mentioned below changed the code in such a way that this function will always return failure when passed high_addr == 1; addr will be >= HIGH_ADDR_MARK always, we will fall down to "if (addr > HIGH_ADDR_MARK)" and return failure. Fix this. Link: https://lkml.kernel.org/r/20250620111150.50344-1-dev.jain@arm.com Fixes: d1d86ce28d0f ("selftests/mm: virtual_address_range: conform to TAP format output") Signed-off-by: Dev Jain Reviewed-by: Donet Tom Acked-by: David Hildenbrand Cc: Anshuman Khandual Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index b380e102b22f..169dbd692bf5 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -77,8 +77,11 @@ static void validate_addr(char *ptr, int high_addr) { unsigned long addr = (unsigned long) ptr; - if (high_addr && addr < HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); + if (high_addr) { + if (addr < HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); + return; + } if (addr > HIGH_ADDR_MARK) ksft_exit_fail_msg("Bad address %lx\n", addr); From 02d67850aecaf2306d849a3c804a125bb4f9c5f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Fri, 20 Jun 2025 11:25:35 +0200 Subject: [PATCH 259/297] =?UTF-8?q?mailmap:=20update=20Duje=20Mihanovi?= =?UTF-8?q?=C4=87's=20email=20address?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I'm switching to a new mail address, so map my old one to it. Link: https://lkml.kernel.org/r/20250620-mailmap-v1-1-a6b4b72dbd07@dujemihanovic.xyz Signed-off-by: Duje Mihanović Cc: Karel Balej Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index a5534ff9c1d7..bd18a7c9d2ce 100644 --- a/.mailmap +++ b/.mailmap @@ -223,6 +223,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> Dmitry Safonov <0x7f454c46@gmail.com> Domen Puncer Douglas Gilbert + Ed L. Cashin Elliot Berman Enric Balletbo i Serra From c0cb210a87fcdda3c25f43b5a64420e6b07d3f53 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 25 Jun 2025 10:52:31 +0100 Subject: [PATCH 260/297] MAINTAINERS: add Lorenzo as THP co-maintainer I am doing a great deal of review and getting ever more involved in THP with intent to do more so in future also, so add myself as co-maintainer to help David with workload. Link: https://lkml.kernel.org/r/20250625095231.42874-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Baolin Wang Acked-by: Dev Jain Acked-by: Zi Yan Acked-by: Oscar Salvador Cc: Barry Song Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4e131bb97894..d4fa7884b4a8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15945,9 +15945,9 @@ F: mm/swapfile.c MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE) M: Andrew Morton M: David Hildenbrand +M: Lorenzo Stoakes R: Zi Yan R: Baolin Wang -R: Lorenzo Stoakes R: Liam R. Howlett R: Nico Pache R: Ryan Roberts From a433791aeaea6e84df709e0b9584b9bbe040cd1c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 24 Jun 2025 14:45:00 -0700 Subject: [PATCH 261/297] atm: Release atm_dev_mutex after removing procfs in atm_dev_deregister(). syzbot reported a warning below during atm_dev_register(). [0] Before creating a new device and procfs/sysfs for it, atm_dev_register() looks up a duplicated device by __atm_dev_lookup(). These operations are done under atm_dev_mutex. However, when removing a device in atm_dev_deregister(), it releases the mutex just after removing the device from the list that __atm_dev_lookup() iterates over. So, there will be a small race window where the device does not exist on the device list but procfs/sysfs are still not removed, triggering the splat. Let's hold the mutex until procfs/sysfs are removed in atm_dev_deregister(). [0]: proc_dir_entry 'atm/atmtcp:0' already registered WARNING: CPU: 0 PID: 5919 at fs/proc/generic.c:377 proc_register+0x455/0x5f0 fs/proc/generic.c:377 Modules linked in: CPU: 0 UID: 0 PID: 5919 Comm: syz-executor284 Not tainted 6.16.0-rc2-syzkaller-00047-g52da431bf03b #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 RIP: 0010:proc_register+0x455/0x5f0 fs/proc/generic.c:377 Code: 48 89 f9 48 c1 e9 03 80 3c 01 00 0f 85 a2 01 00 00 48 8b 44 24 10 48 c7 c7 20 c0 c2 8b 48 8b b0 d8 00 00 00 e8 0c 02 1c ff 90 <0f> 0b 90 90 48 c7 c7 80 f2 82 8e e8 0b de 23 09 48 8b 4c 24 28 48 RSP: 0018:ffffc9000466fa30 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff817ae248 RDX: ffff888026280000 RSI: ffffffff817ae255 RDI: 0000000000000001 RBP: ffff8880232bed48 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff888076ed2140 R13: dffffc0000000000 R14: ffff888078a61340 R15: ffffed100edda444 FS: 00007f38b3b0c6c0(0000) GS:ffff888124753000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f38b3bdf953 CR3: 0000000076d58000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_create_data+0xbe/0x110 fs/proc/generic.c:585 atm_proc_dev_register+0x112/0x1e0 net/atm/proc.c:361 atm_dev_register+0x46d/0x890 net/atm/resources.c:113 atmtcp_create+0x77/0x210 drivers/atm/atmtcp.c:369 atmtcp_attach drivers/atm/atmtcp.c:403 [inline] atmtcp_ioctl+0x2f9/0xd60 drivers/atm/atmtcp.c:464 do_vcc_ioctl+0x12c/0x930 net/atm/ioctl.c:159 sock_do_ioctl+0x115/0x280 net/socket.c:1190 sock_ioctl+0x227/0x6b0 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __x64_sys_ioctl+0x18b/0x210 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x4c0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f38b3b74459 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f38b3b0c198 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f38b3bfe318 RCX: 00007f38b3b74459 RDX: 0000000000000000 RSI: 0000000000006180 RDI: 0000000000000005 RBP: 00007f38b3bfe310 R08: 65732f636f72702f R09: 65732f636f72702f R10: 65732f636f72702f R11: 0000000000000246 R12: 00007f38b3bcb0ac R13: 00007f38b3b0c1a0 R14: 0000200000000200 R15: 00007f38b3bcb03b Fixes: 64bf69ddff76 ("[ATM]: deregistration removes device from atm_devs list immediately") Reported-by: syzbot+8bd335d2ad3b93e80715@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/685316de.050a0220.216029.0087.GAE@google.com/ Tested-by: syzbot+8bd335d2ad3b93e80715@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250624214505.570679-1-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- net/atm/resources.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/atm/resources.c b/net/atm/resources.c index 995d29e7fb13..b19d851e1f44 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -146,11 +146,10 @@ void atm_dev_deregister(struct atm_dev *dev) */ mutex_lock(&atm_dev_mutex); list_del(&dev->dev_list); - mutex_unlock(&atm_dev_mutex); - atm_dev_release_vccs(dev); atm_unregister_sysfs(dev); atm_proc_dev_deregister(dev); + mutex_unlock(&atm_dev_mutex); atm_dev_put(dev); } From 1dcea07810c8bb4aba1bfed0bbe015b2d708a903 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jun 2025 12:17:51 -0400 Subject: [PATCH 262/297] bcachefs: btree_root_unreadable_and_scan_found_nothing should not be autofix Autofix is specified in btree_gc.c if it's not an important btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 041887aad63a..0641fb634bd4 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -301,7 +301,7 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ From 3e72acb78b73ccffeaf929c039dc5a0a7a147535 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jun 2025 12:45:11 -0400 Subject: [PATCH 263/297] bcachefs: Ensure btree node scan runs before checking for scanned nodes Previously, calling bch2_btree_has_scanned_nodes() when btree node scan hadn't actually run would erroniously return false - causing us to think a btree was entirely gone. This fixes a 6.16 regression from moving the scheduling of btree node scan out of bch2_btree_lost_data() (fixing the bug where we'd schedule it persistently in the superblock) and only scheduling it when check_toploogy() is asking for scanned btree nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 29 ++++++++++++++++++----------- fs/bcachefs/btree_node_scan.c | 6 +++++- fs/bcachefs/btree_node_scan.h | 2 +- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 697c6ecc3a65..bac108e93823 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -534,32 +534,39 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct return ret; } -static int bch2_check_root(struct btree_trans *trans, enum btree_id i, +static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, bool *reconstructed_root) { struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, i); + struct btree_root *r = bch2_btree_id_root(c, btree); struct printbuf buf = PRINTBUF; int ret = 0; - bch2_btree_id_to_text(&buf, i); + bch2_btree_id_to_text(&buf, btree); if (r->error) { bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - r->alive = false; - r->error = 0; + ret = bch2_btree_has_scanned_nodes(c, btree); + if (ret < 0) + goto err; - if (!bch2_btree_has_scanned_nodes(c, i)) { + if (!ret) { __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), + FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0), btree_root_unreadable_and_scan_found_nothing, "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); + + r->alive = false; + r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 0); } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + r->alive = false; + r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 1); + + bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); if (ret) goto err; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a35847734a60..23d8c62ea4b6 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -521,8 +521,12 @@ bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) return false; } -bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) { + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + struct found_btree_node search = { .btree_id = btree, .level = 0, diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h index 08687b209787..66e6f9ed19d0 100644 --- a/fs/bcachefs/btree_node_scan.h +++ b/fs/bcachefs/btree_node_scan.h @@ -4,7 +4,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *); bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); -bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); void bch2_find_btree_nodes_exit(struct find_btree_nodes *); From 64b6a788bd96a0cdc11073a2d1f85413b078c1f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jun 2025 00:48:14 -0400 Subject: [PATCH 264/297] bcachefs: Ensure we rewind to run recovery passes Fix a 6.16 regression from the recovery pass rework, which introduced a bug where calling bch2_run_explicit_recovery_pass() would only return the error code to rewind recovery for the first call that scheduled that recovery pass. If the error code from the first call was swallowed (because it was called by an asynchronous codepath), subsequent calls would go "ok, this pass is already marked as needing to run" and return 0. Fixing this ensures that check_topology bails out to run btree_node_scan before doing any repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index c2c18c0a5429..c09ed2dd4639 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -313,6 +313,9 @@ static bool recovery_pass_needs_set(struct bch_fs *c, */ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); if (persistent ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) @@ -323,6 +326,9 @@ static bool recovery_pass_needs_set(struct bch_fs *c, (r->passes_ratelimiting & BIT_ULL(pass))) return true; + if (rewind) + return true; + return false; } @@ -337,7 +343,6 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, struct bch_fs_recovery *r = &c->recovery; int ret = 0; - lockdep_assert_held(&c->sb_lock); bch2_printbuf_make_room(out, 1024); @@ -408,10 +413,8 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, { int ret = 0; - scoped_guard(mutex, &c->sb_lock) { - if (!recovery_pass_needs_set(c, pass, &flags)) - return 0; - + if (recovery_pass_needs_set(c, pass, &flags)) { + guard(mutex)(&c->sb_lock); ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); bch2_write_super(c); } From ef6fac0f9e5d0695cee1d820c727fe753eca52d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Jun 2025 14:55:59 -0400 Subject: [PATCH 265/297] bcachefs: Plumb correct ip to trans_relock_fail tracepoint Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 91a51aef82f1..bed2b4b6ffb9 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -771,7 +771,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans) } static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, - struct get_locks_fail *f, bool trace) + struct get_locks_fail *f, bool trace, ulong ip) { if (!trace) goto out; @@ -796,7 +796,7 @@ static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, st prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + trace_trans_restart_relock(trans, ip, buf.buf); printbuf_exit(&buf); } @@ -806,7 +806,7 @@ static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, st bch2_trans_verify_locks(trans); } -static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) { bch2_trans_verify_locks(trans); @@ -825,7 +825,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) if (path->should_be_locked && (ret = btree_path_get_locks(trans, path, false, &f, BCH_ERR_transaction_restart_relock))) { - bch2_trans_relock_fail(trans, path, &f, trace); + bch2_trans_relock_fail(trans, path, &f, trace, ip); return ret; } } @@ -838,12 +838,12 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) int bch2_trans_relock(struct btree_trans *trans) { - return __bch2_trans_relock(trans, true); + return __bch2_trans_relock(trans, true, _RET_IP_); } int bch2_trans_relock_notrace(struct btree_trans *trans) { - return __bch2_trans_relock(trans, false); + return __bch2_trans_relock(trans, false, _RET_IP_); } void bch2_trans_unlock(struct btree_trans *trans) From 7ab6847a03229e73bb7c58ca397630f699e79b53 Mon Sep 17 00:00:00 2001 From: Salvatore Bonaccorso Date: Wed, 25 Jun 2025 20:41:28 +0200 Subject: [PATCH 266/297] ALSA: hda/realtek: Fix built-in mic on ASUS VivoBook X507UAR The built-in mic of ASUS VivoBook X507UAR is broken recently by the fix of the pin sort. The fixup ALC256_FIXUP_ASUS_MIC_NO_PRESENCE is working for addressing the regression, too. Fixes: 3b4309546b48 ("ALSA: hda: Fix headset detection failure due to unstable sort") Reported-by: Igor Tamara Closes: https://bugs.debian.org/1108069 Signed-off-by: Salvatore Bonaccorso Link: https://lore.kernel.org/CADdHDco7_o=4h_epjEAb92Dj-vUz_PoTC2-W9g5ncT2E0NzfeQ@mail.gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 784644a2b51d..5d6d01ecfee2 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11031,6 +11031,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1df3, "ASUS UM5606WA", ALC294_FIXUP_BASS_SPEAKER_15), SND_PCI_QUIRK(0x1043, 0x1264, "ASUS UM5606KA", ALC294_FIXUP_BASS_SPEAKER_15), SND_PCI_QUIRK(0x1043, 0x1e02, "ASUS UX3402ZA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1e10, "ASUS VivoBook X507UAR", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1e11, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA502), SND_PCI_QUIRK(0x1043, 0x1e12, "ASUS UM3402", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1e1f, "ASUS Vivobook 15 X1504VAP", ALC2XX_FIXUP_HEADSET_MIC), From 1476b218327b89bbb64c14619a2d34f0c320f2c3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 25 Jun 2025 18:07:37 +0100 Subject: [PATCH 267/297] perf/aux: Fix pending disable flow when the AUX ring buffer overruns If an AUX event overruns, the event core layer intends to disable the event by setting the 'pending_disable' flag. Unfortunately, the event is not actually disabled afterwards. In commit: ca6c21327c6a ("perf: Fix missing SIGTRAPs") the 'pending_disable' flag was changed to a boolean. However, the AUX event code was not updated accordingly. The flag ends up holding a CPU number. If this number is zero, the flag is taken as false and the IRQ work is never triggered. Later, with commit: 2b84def990d3 ("perf: Split __perf_pending_irq() out of perf_pending_irq()") a new IRQ work 'pending_disable_irq' was introduced to handle event disabling. The AUX event path was not updated to kick off the work queue. To fix this bug, when an AUX ring buffer overrun is detected, call perf_event_disable_inatomic() to initiate the pending disable flow. Also update the outdated comment for setting the flag, to reflect the boolean values (0 or 1). Fixes: 2b84def990d3 ("perf: Split __perf_pending_irq() out of perf_pending_irq()") Fixes: ca6c21327c6a ("perf: Fix missing SIGTRAPs") Signed-off-by: Leo Yan Signed-off-by: Ingo Molnar Reviewed-by: James Clark Reviewed-by: Yeoreum Yun Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Liang Kan Cc: Marco Elver Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20250625170737.2918295-1-leo.yan@arm.com --- kernel/events/core.c | 6 +++--- kernel/events/ring_buffer.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1f746469fda5..7281230044d0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7251,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event) * CPU-A CPU-B * * perf_event_disable_inatomic() - * @pending_disable = CPU-A; + * @pending_disable = 1; * irq_work_queue(); * * sched-out - * @pending_disable = -1; + * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() - * @pending_disable = CPU-B; + * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index d2aef87c7e9f..aa9a759e824f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; @@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); } From 8d89661a36dd3bb8c9902cff36dc0c144dce3faf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 11:32:58 -0700 Subject: [PATCH 268/297] net: selftests: fix TCP packet checksum The length in the pseudo header should be the length of the L3 payload AKA the L4 header+payload. The selftest code builds the packet from the lower layers up, so all the headers are pushed already when it constructs L4. We need to subtract the lower layer headers from skb->len. Fixes: 3e1e58d64c3d ("net: add generic selftest support") Signed-off-by: Jakub Kicinski Reviewed-by: Gerhard Engleder Reported-by: Oleksij Rempel Tested-by: Oleksij Rempel Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/20250624183258.3377740-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/selftests.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/selftests.c b/net/core/selftests.c index 35f807ea9952..406faf8e5f3f 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -160,8 +160,9 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; if (attr->tcp) { - thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, - ihdr->daddr, 0); + int l4len = skb->len - skb_transport_offset(skb); + + thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { From 85720e04d9af0b77f8092b12a06661a8d459d4a0 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Wed, 25 Jun 2025 10:39:24 +0800 Subject: [PATCH 269/297] net: libwx: fix the creation of page_pool 'rx_ring->size' means the count of ring descriptors multiplied by the size of one descriptor. When increasing the count of ring descriptors, it may exceed the limit of pool size. [ 864.209610] page_pool_create_percpu() gave up with errno -7 [ 864.209613] txgbe 0000:11:00.0: Page pool creation failed: -7 Fix to set the pool_size to the count of ring descriptors. Fixes: 850b971110b2 ("net: libwx: Allocate Rx and Tx resources") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Reviewed-by: Mina Almasry Link: https://patch.msgid.link/434C72BFB40E350A+20250625023924.21821-1-jiawenwu@trustnetic.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 7f2e6cddfeb1..c57cc4f27249 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -2623,7 +2623,7 @@ static int wx_alloc_page_pool(struct wx_ring *rx_ring) struct page_pool_params pp_params = { .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, .order = 0, - .pool_size = rx_ring->size, + .pool_size = rx_ring->count, .nid = dev_to_node(rx_ring->dev), .dev = rx_ring->dev, .dma_dir = DMA_FROM_DEVICE, From dd2c18548964ae7ad48d208a765d909cd35448a1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 11 Jun 2025 06:50:47 +0200 Subject: [PATCH 270/297] nvme: reset delayed remove_work after reconnect The remove_work will proceed with permanently disconnecting on the initial final path failure if the head shows no paths after the delay. If a new path connects while the remove_work is pending, and if that new path happens to disconnect before that remove_work executes, the delayed removal should reset based on the most recent path disconnect time, but queue_delayed_work() won't do anything if the work is already pending. Attempt to cancel the delayed work when a new path connects, and use mod_delayed_work() in case the remove_work remains pending anyway. Signed-off-by: Keith Busch Reviewed-by: Nilay Shroff Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++++ drivers/nvme/host/multipath.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 92697f98c601..724f5732786c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4036,6 +4036,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) list_add_tail_rcu(&ns->siblings, &head->list); ns->head = head; mutex_unlock(&ctrl->subsys->lock); + +#ifdef CONFIG_NVME_MULTIPATH + cancel_delayed_work(&head->remove_work); +#endif return 0; out_put_ns_head: diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 140079ff86e6..1062467595f3 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -1311,7 +1311,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) */ if (!try_module_get(THIS_MODULE)) goto out; - queue_delayed_work(nvme_wq, &head->remove_work, + mod_delayed_work(nvme_wq, &head->remove_work, head->delayed_removal_secs * HZ); } else { list_del_init(&head->entry); From b2e607fecac15e07f50269c080e2e71b5049dfa2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2025 07:09:21 +0200 Subject: [PATCH 271/297] nvme: refactor the atomic write unit detection Move all the code out of nvme_update_disk_info into the helper, and rename the helper to have a somewhat less clumsy name. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: John Garry --- drivers/nvme/host/core.c | 72 +++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 724f5732786c..520fb5f1e214 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2015,21 +2015,51 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, } -static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, - struct nvme_id_ns *id, struct queue_limits *lim, - u32 bs, u32 atomic_bs) +static u32 nvme_configure_atomic_write(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, u32 bs) { - unsigned int boundary = 0; + u32 atomic_bs, boundary = 0; - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { - if (le16_to_cpu(id->nabspf)) + /* + * We do not support an offset for the atomic boundaries. + */ + if (id->nabo) + return bs; + + if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) { + /* + * Use the per-namespace atomic write unit when available. + */ + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; + if (id->nabspf) boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } else { + /* + * Use the controller wide atomic write unit. This sucks + * because the limit is defined in terms of logical blocks while + * namespaces can have different formats, and because there is + * no clear language in the specification prohibiting different + * values for different controllers in the subsystem. + */ + atomic_bs = (1 + ns->ctrl->awupf) * bs; } + + if (!ns->ctrl->subsys->atomic_bs) { + ns->ctrl->subsys->atomic_bs = atomic_bs; + } else if (ns->ctrl->subsys->atomic_bs != atomic_bs) { + dev_err_ratelimited(ns->ctrl->device, + "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n", + ns->disk ? ns->disk->disk_name : "?", + ns->ctrl->subsys->atomic_bs, + atomic_bs); + } + lim->atomic_write_hw_max = atomic_bs; lim->atomic_write_hw_boundary = boundary; lim->atomic_write_hw_unit_min = bs; lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); lim->features |= BLK_FEAT_ATOMIC_WRITES; + return atomic_bs; } static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) @@ -2067,34 +2097,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, valid = false; } - atomic_bs = phys_bs = bs; - if (id->nabo == 0) { - /* - * Bit 1 indicates whether NAWUPF is defined for this namespace - * and whether it should be used instead of AWUPF. If NAWUPF == - * 0 then AWUPF must be used instead. - */ - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) - atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; - else - atomic_bs = (1 + ns->ctrl->awupf) * bs; - - /* - * Set subsystem atomic bs. - */ - if (ns->ctrl->subsys->atomic_bs) { - if (atomic_bs != ns->ctrl->subsys->atomic_bs) { - dev_err_ratelimited(ns->ctrl->device, - "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n", - ns->disk ? ns->disk->disk_name : "?", - ns->ctrl->subsys->atomic_bs, - atomic_bs); - } - } else - ns->ctrl->subsys->atomic_bs = atomic_bs; - - nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs); - } + phys_bs = bs; + atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs); if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { /* NPWG = Namespace Preferred Write Granularity */ From f46d273449ba65afd53f3dd8fe0182c9df877e08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2025 06:54:56 +0200 Subject: [PATCH 272/297] nvme: fix atomic write size validation Don't mix the namespace and controller values, and validate the per-controller limit when probing the controller. This avoid spurious failures for controllers with namespaces that have different namespaces with different logical block sizes, or report the per-namespace values only for some namespaces. It also fixes a missing queue_limits_cancel_update in an error path by removing that error path. Fixes: 8695f060a029 ("nvme: all namespaces in a subsystem must adhere to a common atomic write size") Reported-by: Yi Zhang Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: John Garry Tested-by: Yi Zhang --- drivers/nvme/host/core.c | 33 +++++++++++---------------------- drivers/nvme/host/nvme.h | 3 +-- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 520fb5f1e214..e533d791955d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2041,17 +2041,7 @@ static u32 nvme_configure_atomic_write(struct nvme_ns *ns, * no clear language in the specification prohibiting different * values for different controllers in the subsystem. */ - atomic_bs = (1 + ns->ctrl->awupf) * bs; - } - - if (!ns->ctrl->subsys->atomic_bs) { - ns->ctrl->subsys->atomic_bs = atomic_bs; - } else if (ns->ctrl->subsys->atomic_bs != atomic_bs) { - dev_err_ratelimited(ns->ctrl->device, - "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n", - ns->disk ? ns->disk->disk_name : "?", - ns->ctrl->subsys->atomic_bs, - atomic_bs); + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; } lim->atomic_write_hw_max = atomic_bs; @@ -2386,16 +2376,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; - /* - * Validate the max atomic write size fits within the subsystem's - * atomic write capabilities. - */ - if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) { - blk_mq_unfreeze_queue(ns->disk->queue, memflags); - ret = -ENXIO; - goto out; - } - nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) @@ -3219,6 +3199,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->model, id->mn, sizeof(subsys->model)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; + subsys->awupf = le16_to_cpu(id->awupf); /* Versions prior to 1.4 don't necessarily report a valid type */ if (id->cntrltype == NVME_CTRL_DISC || @@ -3556,6 +3537,15 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret) goto out_free; } + + if (le16_to_cpu(id->awupf) != ctrl->subsys->awupf) { + dev_err_ratelimited(ctrl->device, + "inconsistent AWUPF, controller not added (%u/%u).\n", + le16_to_cpu(id->awupf), ctrl->subsys->awupf); + ret = -EINVAL; + goto out_free; + } + memcpy(ctrl->subsys->firmware_rev, id->fr, sizeof(ctrl->subsys->firmware_rev)); @@ -3651,7 +3641,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); - ctrl->awupf = le16_to_cpu(id->awupf); out_free: kfree(id); return ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a468cdc5b5cb..7df2ea21851f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -410,7 +410,6 @@ struct nvme_ctrl { enum nvme_ctrl_type cntrltype; enum nvme_dctype dctype; - u16 awupf; /* 0's based value. */ }; static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl) @@ -443,11 +442,11 @@ struct nvme_subsystem { u8 cmic; enum nvme_subsys_type subtype; u16 vendor_id; + u16 awupf; /* 0's based value. */ struct ida ns_ida; #ifdef CONFIG_NVME_MULTIPATH enum nvme_iopolicy iopolicy; #endif - u32 atomic_bs; }; /* From f0ef0b02af381418973a952867052b12194f9c16 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 26 Jun 2025 20:07:10 +0800 Subject: [PATCH 273/297] LoongArch: Replace __ASSEMBLY__ with __ASSEMBLER__ in headers While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembler code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This is bad since macros starting with two underscores are names that are reserved by the C language. It can also be very confusing for the developers when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. So let's now standardize on the __ASSEMBLER__ macro that is provided by the compilers. This is almost a completely mechanical patch (done with a simple "sed -i" statement), with one comment tweaked manually in the arch/loongarch/include/asm/cpu.h file (it was missing the trailing underscores). Signed-off-by: Thomas Huth Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/addrspace.h | 8 ++++---- arch/loongarch/include/asm/alternative-asm.h | 4 ++-- arch/loongarch/include/asm/alternative.h | 4 ++-- arch/loongarch/include/asm/asm-extable.h | 6 +++--- arch/loongarch/include/asm/asm.h | 8 ++++---- arch/loongarch/include/asm/cpu.h | 4 ++-- arch/loongarch/include/asm/ftrace.h | 4 ++-- arch/loongarch/include/asm/gpr-num.h | 6 +++--- arch/loongarch/include/asm/irqflags.h | 4 ++-- arch/loongarch/include/asm/jump_label.h | 4 ++-- arch/loongarch/include/asm/kasan.h | 2 +- arch/loongarch/include/asm/loongarch.h | 16 ++++++++-------- arch/loongarch/include/asm/orc_types.h | 4 ++-- arch/loongarch/include/asm/page.h | 4 ++-- arch/loongarch/include/asm/pgtable-bits.h | 4 ++-- arch/loongarch/include/asm/pgtable.h | 4 ++-- arch/loongarch/include/asm/prefetch.h | 2 +- arch/loongarch/include/asm/thread_info.h | 4 ++-- arch/loongarch/include/asm/types.h | 2 +- arch/loongarch/include/asm/unwind_hints.h | 6 +++--- arch/loongarch/include/asm/vdso/arch_data.h | 4 ++-- arch/loongarch/include/asm/vdso/getrandom.h | 4 ++-- arch/loongarch/include/asm/vdso/gettimeofday.h | 4 ++-- arch/loongarch/include/asm/vdso/processor.h | 4 ++-- arch/loongarch/include/asm/vdso/vdso.h | 4 ++-- arch/loongarch/include/asm/vdso/vsyscall.h | 4 ++-- tools/arch/loongarch/include/asm/orc_types.h | 4 ++-- 27 files changed, 64 insertions(+), 64 deletions(-) diff --git a/arch/loongarch/include/asm/addrspace.h b/arch/loongarch/include/asm/addrspace.h index fe198b473f84..e739dbc6329d 100644 --- a/arch/loongarch/include/asm/addrspace.h +++ b/arch/loongarch/include/asm/addrspace.h @@ -18,12 +18,12 @@ /* * This gives the physical RAM offset. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifndef PHYS_OFFSET #define PHYS_OFFSET _UL(0) #endif extern unsigned long vm_map_base; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifndef IO_BASE #define IO_BASE CSR_DMW0_BASE @@ -66,7 +66,7 @@ extern unsigned long vm_map_base; #define FIXADDR_TOP ((unsigned long)(long)(int)0xfffe0000) #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define _ATYPE_ #define _ATYPE32_ #define _ATYPE64_ @@ -85,7 +85,7 @@ extern unsigned long vm_map_base; /* * 32/64-bit LoongArch address spaces */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define _ACAST32_ #define _ACAST64_ #else diff --git a/arch/loongarch/include/asm/alternative-asm.h b/arch/loongarch/include/asm/alternative-asm.h index ff3d10ac393f..7dc29bd9b2f0 100644 --- a/arch/loongarch/include/asm/alternative-asm.h +++ b/arch/loongarch/include/asm/alternative-asm.h @@ -2,7 +2,7 @@ #ifndef _ASM_ALTERNATIVE_ASM_H #define _ASM_ALTERNATIVE_ASM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include @@ -77,6 +77,6 @@ .previous .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_ALTERNATIVE_ASM_H */ diff --git a/arch/loongarch/include/asm/alternative.h b/arch/loongarch/include/asm/alternative.h index cee7b29785ab..b5bae21fb3c8 100644 --- a/arch/loongarch/include/asm/alternative.h +++ b/arch/loongarch/include/asm/alternative.h @@ -2,7 +2,7 @@ #ifndef _ASM_ALTERNATIVE_H #define _ASM_ALTERNATIVE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -106,6 +106,6 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ (asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_ALTERNATIVE_H */ diff --git a/arch/loongarch/include/asm/asm-extable.h b/arch/loongarch/include/asm/asm-extable.h index df05005f2b80..d60bdf2e6377 100644 --- a/arch/loongarch/include/asm/asm-extable.h +++ b/arch/loongarch/include/asm/asm-extable.h @@ -7,7 +7,7 @@ #define EX_TYPE_UACCESS_ERR_ZERO 2 #define EX_TYPE_BPF 3 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ .pushsection __ex_table, "a"; \ @@ -22,7 +22,7 @@ __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_FIXUP, 0) .endm -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #include #include @@ -60,6 +60,6 @@ #define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err) \ _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, zero) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ASM_EXTABLE_H */ diff --git a/arch/loongarch/include/asm/asm.h b/arch/loongarch/include/asm/asm.h index f591b3245def..f018d26fc995 100644 --- a/arch/loongarch/include/asm/asm.h +++ b/arch/loongarch/include/asm/asm.h @@ -110,7 +110,7 @@ #define LONG_SRA srai.w #define LONG_SRAV sra.w -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define LONG .word #endif #define LONGSIZE 4 @@ -131,7 +131,7 @@ #define LONG_SRA srai.d #define LONG_SRAV sra.d -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define LONG .dword #endif #define LONGSIZE 8 @@ -158,7 +158,7 @@ #define PTR_SCALESHIFT 2 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define PTR .word #endif #define PTRSIZE 4 @@ -181,7 +181,7 @@ #define PTR_SCALESHIFT 3 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define PTR .dword #endif #define PTRSIZE 8 diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h index 98cf4d7b4b0a..dfb982fe8701 100644 --- a/arch/loongarch/include/asm/cpu.h +++ b/arch/loongarch/include/asm/cpu.h @@ -46,7 +46,7 @@ #define PRID_PRODUCT_MASK 0x0fff -#if !defined(__ASSEMBLY__) +#if !defined(__ASSEMBLER__) enum cpu_type_enum { CPU_UNKNOWN, @@ -55,7 +55,7 @@ enum cpu_type_enum { CPU_LAST }; -#endif /* !__ASSEMBLY */ +#endif /* !__ASSEMBLER__ */ /* * ISA Level encodings diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h index 6e0a99763a9a..f4caaf764f9e 100644 --- a/arch/loongarch/include/asm/ftrace.h +++ b/arch/loongarch/include/asm/ftrace.h @@ -14,7 +14,7 @@ #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifndef CONFIG_DYNAMIC_FTRACE @@ -84,7 +84,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/loongarch/include/asm/gpr-num.h b/arch/loongarch/include/asm/gpr-num.h index 996038da806d..af95b941f48b 100644 --- a/arch/loongarch/include/asm/gpr-num.h +++ b/arch/loongarch/include/asm/gpr-num.h @@ -2,7 +2,7 @@ #ifndef __ASM_GPR_NUM_H #define __ASM_GPR_NUM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .equ .L__gpr_num_zero, 0 .irp num,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 @@ -25,7 +25,7 @@ .equ .L__gpr_num_$s\num, 23 + \num .endr -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __DEFINE_ASM_GPR_NUMS \ " .equ .L__gpr_num_zero, 0\n" \ @@ -47,6 +47,6 @@ " .equ .L__gpr_num_$s\\num, 23 + \\num\n" \ " .endr\n" \ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_GPR_NUM_H */ diff --git a/arch/loongarch/include/asm/irqflags.h b/arch/loongarch/include/asm/irqflags.h index 003172b8406b..620163628a7f 100644 --- a/arch/loongarch/include/asm/irqflags.h +++ b/arch/loongarch/include/asm/irqflags.h @@ -5,7 +5,7 @@ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -80,6 +80,6 @@ static inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(arch_local_save_flags()); } -#endif /* #ifndef __ASSEMBLY__ */ +#endif /* #ifndef __ASSEMBLER__ */ #endif /* _ASM_IRQFLAGS_H */ diff --git a/arch/loongarch/include/asm/jump_label.h b/arch/loongarch/include/asm/jump_label.h index 8a924bd69d19..4000c7603d8e 100644 --- a/arch/loongarch/include/asm/jump_label.h +++ b/arch/loongarch/include/asm/jump_label.h @@ -7,7 +7,7 @@ #ifndef __ASM_JUMP_LABEL_H #define __ASM_JUMP_LABEL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -50,5 +50,5 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke return true; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_JUMP_LABEL_H */ diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h index 7f52bd31b9d4..62f139a9c87d 100644 --- a/arch/loongarch/include/asm/kasan.h +++ b/arch/loongarch/include/asm/kasan.h @@ -2,7 +2,7 @@ #ifndef __ASM_KASAN_H #define __ASM_KASAN_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index d84dac88a584..a0994d226eff 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -9,15 +9,15 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* CPUCFG */ #define read_cpucfg(reg) __cpucfg(reg) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* LoongArch Registers */ #define REG_ZERO 0x0 @@ -53,7 +53,7 @@ #define REG_S7 0x1e #define REG_S8 0x1f -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* Bit fields for CPUCFG registers */ #define LOONGARCH_CPUCFG0 0x0 @@ -171,7 +171,7 @@ * SW emulation for KVM hypervirsor, see arch/loongarch/include/uapi/asm/kvm_para.h */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* CSR */ #define csr_read32(reg) __csrrd_w(reg) @@ -187,7 +187,7 @@ #define iocsr_write32(val, reg) __iocsrwr_w(val, reg) #define iocsr_write64(val, reg) __iocsrwr_d(val, reg) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* CSR register number */ @@ -1195,7 +1195,7 @@ #define LOONGARCH_IOCSR_EXTIOI_ROUTE_BASE 0x1c00 #define IOCSR_EXTIOI_VECTOR_NUM 256 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static __always_inline u64 drdtime(void) { @@ -1357,7 +1357,7 @@ __BUILD_CSR_OP(tlbidx) #define clear_csr_estat(val) \ csr_xchg32(~(val), val, LOONGARCH_CSR_ESTAT) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* Generic EntryLo bit definitions */ #define ENTRYLO_V (_ULCAST_(1) << 0) diff --git a/arch/loongarch/include/asm/orc_types.h b/arch/loongarch/include/asm/orc_types.h index caf1f71a1057..d5fa98d1d177 100644 --- a/arch/loongarch/include/asm/orc_types.h +++ b/arch/loongarch/include/asm/orc_types.h @@ -34,7 +34,7 @@ #define ORC_TYPE_REGS 3 #define ORC_TYPE_REGS_PARTIAL 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. It contains only the necessary parts of DWARF @@ -53,6 +53,6 @@ struct orc_entry { unsigned int type:3; unsigned int signal:1; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ORC_TYPES_H */ diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 7368f12b7cb1..a3aaf34fba16 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -15,7 +15,7 @@ #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -110,6 +110,6 @@ extern int __virt_addr_valid(volatile void *kaddr); #include #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_PAGE_H */ diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index 45bfc65a0c9f..7bbfb04a54cc 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -92,7 +92,7 @@ #define PAGE_KERNEL_WUC __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ _PAGE_GLOBAL | _PAGE_KERN | _CACHE_WUC) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL_SUC) @@ -127,6 +127,6 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot) return __pgprot(prot); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_PGTABLE_BITS_H */ diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index b30185302c07..f2aeff544cee 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -55,7 +55,7 @@ #define USER_PTRS_PER_PGD ((TASK_SIZE64 / PGDIR_SIZE)?(TASK_SIZE64 / PGDIR_SIZE):1) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -618,6 +618,6 @@ static inline long pmd_protnone(pmd_t pmd) #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_PGTABLE_H */ diff --git a/arch/loongarch/include/asm/prefetch.h b/arch/loongarch/include/asm/prefetch.h index 1672262a5e2e..0b168cdaae9a 100644 --- a/arch/loongarch/include/asm/prefetch.h +++ b/arch/loongarch/include/asm/prefetch.h @@ -8,7 +8,7 @@ #define Pref_Load 0 #define Pref_Store 8 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro __pref hint addr #ifdef CONFIG_CPU_HAS_PREFETCH diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 4f5a9441754e..9dfa2ef00816 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -10,7 +10,7 @@ #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -53,7 +53,7 @@ static inline struct thread_info *current_thread_info(void) register unsigned long current_stack_pointer __asm__("$sp"); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* thread information allocation */ #define THREAD_SIZE SZ_16K diff --git a/arch/loongarch/include/asm/types.h b/arch/loongarch/include/asm/types.h index baf15a0dcf8b..0edd731f3d6a 100644 --- a/arch/loongarch/include/asm/types.h +++ b/arch/loongarch/include/asm/types.h @@ -8,7 +8,7 @@ #include #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define _ULCAST_ #define _U64CAST_ #else diff --git a/arch/loongarch/include/asm/unwind_hints.h b/arch/loongarch/include/asm/unwind_hints.h index 2c68bc72736c..16c7f7e465a0 100644 --- a/arch/loongarch/include/asm/unwind_hints.h +++ b/arch/loongarch/include/asm/unwind_hints.h @@ -5,7 +5,7 @@ #include #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro UNWIND_HINT_UNDEFINED UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED @@ -23,7 +23,7 @@ UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_CALL .endm -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define UNWIND_HINT_SAVE \ UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0) @@ -31,6 +31,6 @@ #define UNWIND_HINT_RESTORE \ UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_LOONGARCH_UNWIND_HINTS_H */ diff --git a/arch/loongarch/include/asm/vdso/arch_data.h b/arch/loongarch/include/asm/vdso/arch_data.h index 322d0a5f1c84..395ec223bcbe 100644 --- a/arch/loongarch/include/asm/vdso/arch_data.h +++ b/arch/loongarch/include/asm/vdso/arch_data.h @@ -7,7 +7,7 @@ #ifndef _VDSO_ARCH_DATA_H #define _VDSO_ARCH_DATA_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -20,6 +20,6 @@ struct vdso_arch_data { struct vdso_pcpu_data pdata[NR_CPUS]; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h index a81724b69f29..2ff05003c6e7 100644 --- a/arch/loongarch/include/asm/vdso/getrandom.h +++ b/arch/loongarch/include/asm/vdso/getrandom.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_GETRANDOM_H #define __ASM_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -28,6 +28,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h index f15503e3336c..dcafabca9bb6 100644 --- a/arch/loongarch/include/asm/vdso/gettimeofday.h +++ b/arch/loongarch/include/asm/vdso/gettimeofday.h @@ -7,7 +7,7 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -89,6 +89,6 @@ static inline bool loongarch_vdso_hres_capable(void) } #define __arch_vdso_hres_capable loongarch_vdso_hres_capable -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/loongarch/include/asm/vdso/processor.h b/arch/loongarch/include/asm/vdso/processor.h index ef5770b343a0..1e255373b0b8 100644 --- a/arch/loongarch/include/asm/vdso/processor.h +++ b/arch/loongarch/include/asm/vdso/processor.h @@ -5,10 +5,10 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define cpu_relax() barrier() -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/loongarch/include/asm/vdso/vdso.h b/arch/loongarch/include/asm/vdso/vdso.h index 50c65fb29daf..04bd2d452876 100644 --- a/arch/loongarch/include/asm/vdso/vdso.h +++ b/arch/loongarch/include/asm/vdso/vdso.h @@ -7,7 +7,7 @@ #ifndef _ASM_VDSO_VDSO_H #define _ASM_VDSO_VDSO_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -16,6 +16,6 @@ #define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/loongarch/include/asm/vdso/vsyscall.h b/arch/loongarch/include/asm/vdso/vsyscall.h index 1140b54b4bc8..558eb9dfda52 100644 --- a/arch/loongarch/include/asm/vdso/vsyscall.h +++ b/arch/loongarch/include/asm/vdso/vsyscall.h @@ -2,13 +2,13 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* The asm-generic header needs to be included after the definitions above */ #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/tools/arch/loongarch/include/asm/orc_types.h b/tools/arch/loongarch/include/asm/orc_types.h index caf1f71a1057..d5fa98d1d177 100644 --- a/tools/arch/loongarch/include/asm/orc_types.h +++ b/tools/arch/loongarch/include/asm/orc_types.h @@ -34,7 +34,7 @@ #define ORC_TYPE_REGS 3 #define ORC_TYPE_REGS_PARTIAL 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. It contains only the necessary parts of DWARF @@ -53,6 +53,6 @@ struct orc_entry { unsigned int type:3; unsigned int signal:1; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ORC_TYPES_H */ From 7d69294b8a8d0e19600797711ed3bc047fead1c1 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 26 Jun 2025 20:07:18 +0800 Subject: [PATCH 274/297] LoongArch: Fix build warnings about export.h After commit a934a57a42f64a4 ("scripts/misc-check: check missing #include when W=1") and 7d95680d64ac8e836c ("scripts/misc-check: check unnecessary #include when W=1"), we get some build warnings with W=1: arch/loongarch/kernel/acpi.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/kernel/alternative.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/kernel/kfpu.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/kernel/traps.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/kernel/unwind_guess.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/kernel/unwind_orc.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/kernel/unwind_prologue.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/lib/crc32-loongarch.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/lib/csum.c: warning: EXPORT_SYMBOL() is used, but #include is missing arch/loongarch/kernel/elf.c: warning: EXPORT_SYMBOL() is not used, but #include is present arch/loongarch/kernel/paravirt.c: warning: EXPORT_SYMBOL() is not used, but #include is present arch/loongarch/pci/pci.c: warning: EXPORT_SYMBOL() is not used, but #include is present So fix these build warnings for LoongArch. Reviewed-by: Masahiro Yamada Signed-off-by: Huacai Chen --- arch/loongarch/kernel/acpi.c | 1 + arch/loongarch/kernel/alternative.c | 1 + arch/loongarch/kernel/elf.c | 1 - arch/loongarch/kernel/kfpu.c | 1 + arch/loongarch/kernel/paravirt.c | 1 - arch/loongarch/kernel/traps.c | 1 + arch/loongarch/kernel/unwind_guess.c | 1 + arch/loongarch/kernel/unwind_orc.c | 3 ++- arch/loongarch/kernel/unwind_prologue.c | 1 + arch/loongarch/lib/crc32-loongarch.c | 1 + arch/loongarch/lib/csum.c | 1 + arch/loongarch/pci/pci.c | 1 - 12 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c index a54cd6fd3796..1367ca759468 100644 --- a/arch/loongarch/kernel/acpi.c +++ b/arch/loongarch/kernel/acpi.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/loongarch/kernel/alternative.c b/arch/loongarch/kernel/alternative.c index 4ad13847e962..0e0c766df1e3 100644 --- a/arch/loongarch/kernel/alternative.c +++ b/arch/loongarch/kernel/alternative.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include diff --git a/arch/loongarch/kernel/elf.c b/arch/loongarch/kernel/elf.c index 0fa81ced28dc..3d98c6aa00db 100644 --- a/arch/loongarch/kernel/elf.c +++ b/arch/loongarch/kernel/elf.c @@ -6,7 +6,6 @@ #include #include -#include #include #include diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c index 4c476904227f..141b49bd989c 100644 --- a/arch/loongarch/kernel/kfpu.c +++ b/arch/loongarch/kernel/kfpu.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c index e5a39bbad078..b1b51f920b23 100644 --- a/arch/loongarch/kernel/paravirt.c +++ b/arch/loongarch/kernel/paravirt.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include #include #include diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 47fc2de6d150..3d9be6ca7ec5 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/loongarch/kernel/unwind_guess.c b/arch/loongarch/kernel/unwind_guess.c index 98379b7d4147..08d7951b2f60 100644 --- a/arch/loongarch/kernel/unwind_guess.c +++ b/arch/loongarch/kernel/unwind_guess.c @@ -3,6 +3,7 @@ * Copyright (C) 2022 Loongson Technology Corporation Limited */ #include +#include unsigned long unwind_get_return_address(struct unwind_state *state) { diff --git a/arch/loongarch/kernel/unwind_orc.c b/arch/loongarch/kernel/unwind_orc.c index d623935a7547..0005be49b056 100644 --- a/arch/loongarch/kernel/unwind_orc.c +++ b/arch/loongarch/kernel/unwind_orc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include +#include #include #include #include diff --git a/arch/loongarch/kernel/unwind_prologue.c b/arch/loongarch/kernel/unwind_prologue.c index 929ae240280a..729e775bd40d 100644 --- a/arch/loongarch/kernel/unwind_prologue.c +++ b/arch/loongarch/kernel/unwind_prologue.c @@ -3,6 +3,7 @@ * Copyright (C) 2022 Loongson Technology Corporation Limited */ #include +#include #include #include diff --git a/arch/loongarch/lib/crc32-loongarch.c b/arch/loongarch/lib/crc32-loongarch.c index b37cd8537b45..db22c2ec55e2 100644 --- a/arch/loongarch/lib/crc32-loongarch.c +++ b/arch/loongarch/lib/crc32-loongarch.c @@ -11,6 +11,7 @@ #include #include +#include #include #include diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c index df309ae4045d..bcc9d01d8c41 100644 --- a/arch/loongarch/lib/csum.c +++ b/arch/loongarch/lib/csum.c @@ -2,6 +2,7 @@ // Copyright (C) 2019-2020 Arm Ltd. #include +#include #include #include diff --git a/arch/loongarch/pci/pci.c b/arch/loongarch/pci/pci.c index 2726639150bc..5bc9627a6cf9 100644 --- a/arch/loongarch/pci/pci.c +++ b/arch/loongarch/pci/pci.c @@ -3,7 +3,6 @@ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ #include -#include #include #include #include From 39503fc84b4ea94f2bedca481de5e225e0df729d Mon Sep 17 00:00:00 2001 From: Ming Wang Date: Thu, 26 Jun 2025 20:07:18 +0800 Subject: [PATCH 275/297] LoongArch: Reserve the EFI memory map region The EFI memory map at 'boot_memmap' is crucial for kdump to understand the primary kernel's memory layout. This memory region, typically part of EFI Boot Services (BS) data, can be overwritten after ExitBootServices if not explicitly preserved by the kernel. This commit addresses this by: 1. Calling memblock_reserve() to reserve the entire physical region occupied by the EFI memory map (header + descriptors). This prevents the primary kernel from reallocating and corrupting this area. 2. Setting the EFI_PRESERVE_BS_REGIONS flag in efi.flags. This indicates that efforts have been made to preserve critical BS code/data regions which can be useful for other kernel subsystems or debugging. These changes ensure the original EFI memory map data remains intact, improving kdump reliability and potentially aiding other EFI-related functionalities that might rely on preserved BS code/data. Signed-off-by: Ming Wang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/efi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/loongarch/kernel/efi.c b/arch/loongarch/kernel/efi.c index de21e72759ee..860a3bc030e0 100644 --- a/arch/loongarch/kernel/efi.c +++ b/arch/loongarch/kernel/efi.c @@ -144,6 +144,18 @@ void __init efi_init(void) if (efi_memmap_init_early(&data) < 0) panic("Unable to map EFI memory map.\n"); + /* + * Reserve the physical memory region occupied by the EFI + * memory map table (header + descriptors). This is crucial + * for kdump, as the kdump kernel relies on this original + * memmap passed by the bootloader. Without reservation, + * this region could be overwritten by the primary kernel. + * Also, set the EFI_PRESERVE_BS_REGIONS flag to indicate that + * critical boot services code/data regions like this are preserved. + */ + memblock_reserve((phys_addr_t)boot_memmap, sizeof(*tbl) + data.size); + set_bit(EFI_PRESERVE_BS_REGIONS, &efi.flags); + early_memunmap(tbl, sizeof(*tbl)); } From a0137c9048252ea965a0436895c77d1c917dfe2a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 26 Jun 2025 20:07:18 +0800 Subject: [PATCH 276/297] LoongArch: Handle KCOV __init vs inline mismatches When the KCOV is enabled all functions get instrumented, unless the __no_sanitize_coverage attribute is used. To prepare for __no_sanitize_coverage being applied to __init functions, we have to handle differences in how GCC's inline optimizations get resolved. For LoongArch this exposed several places where __init annotations were missing but ended up being "accidentally correct". So fix these cases. Signed-off-by: Kees Cook Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/smp.h | 2 +- arch/loongarch/kernel/time.c | 2 +- arch/loongarch/mm/ioremap.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h index ad0bd234a0f1..3a47f52959a8 100644 --- a/arch/loongarch/include/asm/smp.h +++ b/arch/loongarch/include/asm/smp.h @@ -39,7 +39,7 @@ int loongson_cpu_disable(void); void loongson_cpu_die(unsigned int cpu); #endif -static inline void plat_smp_setup(void) +static inline void __init plat_smp_setup(void) { loongson_smp_setup(); } diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index bc75a3a69fc8..367906b10f81 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -102,7 +102,7 @@ static int constant_timer_next_event(unsigned long delta, struct clock_event_dev return 0; } -static unsigned long __init get_loops_per_jiffy(void) +static unsigned long get_loops_per_jiffy(void) { unsigned long lpj = (unsigned long)const_clock_freq; diff --git a/arch/loongarch/mm/ioremap.c b/arch/loongarch/mm/ioremap.c index 70ca73019811..df949a3d0f34 100644 --- a/arch/loongarch/mm/ioremap.c +++ b/arch/loongarch/mm/ioremap.c @@ -16,12 +16,12 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } -void *early_memremap_ro(resource_size_t phys_addr, unsigned long size) +void * __init early_memremap_ro(resource_size_t phys_addr, unsigned long size) { return early_memremap(phys_addr, size); } -void *early_memremap_prot(resource_size_t phys_addr, unsigned long size, +void * __init early_memremap_prot(resource_size_t phys_addr, unsigned long size, unsigned long prot_val) { return early_memremap(phys_addr, size); From 080e8d2ecdfde588897aa8a87a8884061f4dbbbb Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 26 Jun 2025 20:07:27 +0800 Subject: [PATCH 277/297] LoongArch: KVM: Avoid overflow with array index The variable index is modified and reused as array index when modify register EIOINTC_ENABLE. There will be array index overflow problem. Cc: stable@vger.kernel.org Fixes: 3956a52bc05b ("LoongArch: KVM: Add EIOINTC read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index f39929d7bf8a..9c47456b805c 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -436,17 +436,16 @@ static int loongarch_eiointc_writew(struct kvm_vcpu *vcpu, break; case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: index = (offset - EIOINTC_ENABLE_START) >> 1; - old_data = s->enable.reg_u32[index]; + old_data = s->enable.reg_u16[index]; s->enable.reg_u16[index] = data; /* * 1: enable irq. * update irq when isr is set. */ data = s->enable.reg_u16[index] & ~old_data & s->isr.reg_u16[index]; - index = index << 1; for (i = 0; i < sizeof(data); i++) { u8 mask = (data >> (i * 8)) & 0xff; - eiointc_enable_irq(vcpu, s, index + i, mask, 1); + eiointc_enable_irq(vcpu, s, index * 2 + i, mask, 1); } /* * 0: disable irq. @@ -455,7 +454,7 @@ static int loongarch_eiointc_writew(struct kvm_vcpu *vcpu, data = ~s->enable.reg_u16[index] & old_data & s->isr.reg_u16[index]; for (i = 0; i < sizeof(data); i++) { u8 mask = (data >> (i * 8)) & 0xff; - eiointc_enable_irq(vcpu, s, index, mask, 0); + eiointc_enable_irq(vcpu, s, index * 2 + i, mask, 0); } break; case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: @@ -529,10 +528,9 @@ static int loongarch_eiointc_writel(struct kvm_vcpu *vcpu, * update irq when isr is set. */ data = s->enable.reg_u32[index] & ~old_data & s->isr.reg_u32[index]; - index = index << 2; for (i = 0; i < sizeof(data); i++) { u8 mask = (data >> (i * 8)) & 0xff; - eiointc_enable_irq(vcpu, s, index + i, mask, 1); + eiointc_enable_irq(vcpu, s, index * 4 + i, mask, 1); } /* * 0: disable irq. @@ -541,7 +539,7 @@ static int loongarch_eiointc_writel(struct kvm_vcpu *vcpu, data = ~s->enable.reg_u32[index] & old_data & s->isr.reg_u32[index]; for (i = 0; i < sizeof(data); i++) { u8 mask = (data >> (i * 8)) & 0xff; - eiointc_enable_irq(vcpu, s, index, mask, 0); + eiointc_enable_irq(vcpu, s, index * 4 + i, mask, 0); } break; case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: @@ -615,10 +613,9 @@ static int loongarch_eiointc_writeq(struct kvm_vcpu *vcpu, * update irq when isr is set. */ data = s->enable.reg_u64[index] & ~old_data & s->isr.reg_u64[index]; - index = index << 3; for (i = 0; i < sizeof(data); i++) { u8 mask = (data >> (i * 8)) & 0xff; - eiointc_enable_irq(vcpu, s, index + i, mask, 1); + eiointc_enable_irq(vcpu, s, index * 8 + i, mask, 1); } /* * 0: disable irq. @@ -627,7 +624,7 @@ static int loongarch_eiointc_writeq(struct kvm_vcpu *vcpu, data = ~s->enable.reg_u64[index] & old_data & s->isr.reg_u64[index]; for (i = 0; i < sizeof(data); i++) { u8 mask = (data >> (i * 8)) & 0xff; - eiointc_enable_irq(vcpu, s, index, mask, 0); + eiointc_enable_irq(vcpu, s, index * 8 + i, mask, 0); } break; case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: From a4b1b51ae132ac199412028a2df7b6c267888190 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 6 Jun 2025 11:45:47 +0100 Subject: [PATCH 278/297] drm/xe: Move DSB l2 flush to a more sensible place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flushing l2 is only needed after all data has been written. Fixes: 01570b446939 ("drm/xe/bmg: implement Wa_16023588340") Signed-off-by: Maarten Lankhorst Cc: Matthew Auld Cc: stable@vger.kernel.org # v6.12+ Reviewed-by: Matthew Auld Signed-off-by: Matthew Auld Reviewed-by: Lucas De Marchi Reviewed-by: Ville Syrjälä Link: https://lore.kernel.org/r/20250606104546.1996818-3-matthew.auld@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 0dd2dd0182bc444a62652e89d08c7f0e4fde15ba) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/display/xe_dsb_buffer.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c index f95375451e2f..9f941fc2e36b 100644 --- a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c +++ b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c @@ -17,10 +17,7 @@ u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val) { - struct xe_device *xe = dsb_buf->vma->bo->tile->xe; - iosys_map_wr(&dsb_buf->vma->bo->vmap, idx * 4, u32, val); - xe_device_l2_flush(xe); } u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) @@ -30,12 +27,9 @@ u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size) { - struct xe_device *xe = dsb_buf->vma->bo->tile->xe; - WARN_ON(idx > (dsb_buf->buf_size - size) / sizeof(*dsb_buf->cmd_buf)); iosys_map_memset(&dsb_buf->vma->bo->vmap, idx * 4, val, size); - xe_device_l2_flush(xe); } bool intel_dsb_buffer_create(struct intel_crtc *crtc, struct intel_dsb_buffer *dsb_buf, size_t size) @@ -74,9 +68,12 @@ void intel_dsb_buffer_cleanup(struct intel_dsb_buffer *dsb_buf) void intel_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf) { + struct xe_device *xe = dsb_buf->vma->bo->tile->xe; + /* * The memory barrier here is to ensure coherency of DSB vs MMIO, * both for weak ordering archs and discrete cards. */ - xe_device_wmb(dsb_buf->vma->bo->tile->xe); + xe_device_wmb(xe); + xe_device_l2_flush(xe); } From f16873f42a06b620669d48a4b5c3f888cb3653a1 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 6 Jun 2025 11:45:48 +0100 Subject: [PATCH 279/297] drm/xe: move DPT l2 flush to a more sensible place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only need the flush for DPT host updates here. Normal GGTT updates don't need special flush. Fixes: 01570b446939 ("drm/xe/bmg: implement Wa_16023588340") Signed-off-by: Matthew Auld Cc: Maarten Lankhorst Cc: stable@vger.kernel.org # v6.12+ Reviewed-by: Ville Syrjälä Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250606104546.1996818-4-matthew.auld@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 35db1da40c8cfd7511dc42f342a133601eb45449) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/display/xe_fb_pin.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_fb_pin.c b/drivers/gpu/drm/xe/display/xe_fb_pin.c index d918ae1c8061..55259969480b 100644 --- a/drivers/gpu/drm/xe/display/xe_fb_pin.c +++ b/drivers/gpu/drm/xe/display/xe_fb_pin.c @@ -164,6 +164,9 @@ static int __xe_pin_fb_vma_dpt(const struct intel_framebuffer *fb, vma->dpt = dpt; vma->node = dpt->ggtt_node[tile0->id]; + + /* Ensure DPT writes are flushed */ + xe_device_l2_flush(xe); return 0; } @@ -333,8 +336,6 @@ static struct i915_vma *__xe_pin_fb_vma(const struct intel_framebuffer *fb, if (ret) goto err_unpin; - /* Ensure DPT writes are flushed */ - xe_device_l2_flush(xe); return vma; err_unpin: From ad40098da5c3b43114d860a5b5740e7204158534 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 13 Jun 2025 00:09:37 +0200 Subject: [PATCH 280/297] drm/xe/guc: Explicitly exit CT safe mode on unwind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During driver probe we might be briefly using CT safe mode, which is based on a delayed work, but usually we are able to stop this once we have IRQ fully operational. However, if we abort the probe quite early then during unwind we might try to destroy the workqueue while there is still a pending delayed work that attempts to restart itself which triggers a WARN. This was recently observed during unsuccessful VF initialization: [ ] xe 0000:00:02.1: probe with driver xe failed with error -62 [ ] ------------[ cut here ]------------ [ ] workqueue: cannot queue safe_mode_worker_func [xe] on wq xe-g2h-wq [ ] WARNING: CPU: 9 PID: 0 at kernel/workqueue.c:2257 __queue_work+0x287/0x710 [ ] RIP: 0010:__queue_work+0x287/0x710 [ ] Call Trace: [ ] delayed_work_timer_fn+0x19/0x30 [ ] call_timer_fn+0xa1/0x2a0 Exit the CT safe mode on unwind to avoid that warning. Fixes: 09b286950f29 ("drm/xe/guc: Allow CTB G2H processing without G2H IRQ") Signed-off-by: Michal Wajdeczko Cc: Matthew Brost Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250612220937.857-3-michal.wajdeczko@intel.com (cherry picked from commit 2ddbb73ec20b98e70a5200cb85deade22ccea2ec) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_ct.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index d0ac48d8f4f7..bbcbb348256f 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -34,6 +34,11 @@ #include "xe_pm.h" #include "xe_trace_guc.h" +static void receive_g2h(struct xe_guc_ct *ct); +static void g2h_worker_func(struct work_struct *w); +static void safe_mode_worker_func(struct work_struct *w); +static void ct_exit_safe_mode(struct xe_guc_ct *ct); + #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) enum { /* Internal states, not error conditions */ @@ -186,14 +191,11 @@ static void guc_ct_fini(struct drm_device *drm, void *arg) { struct xe_guc_ct *ct = arg; + ct_exit_safe_mode(ct); destroy_workqueue(ct->g2h_wq); xa_destroy(&ct->fence_lookup); } -static void receive_g2h(struct xe_guc_ct *ct); -static void g2h_worker_func(struct work_struct *w); -static void safe_mode_worker_func(struct work_struct *w); - static void primelockdep(struct xe_guc_ct *ct) { if (!IS_ENABLED(CONFIG_LOCKDEP)) From af2b588abe006bd55ddd358c4c3b87523349c475 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 13 Jun 2025 00:09:36 +0200 Subject: [PATCH 281/297] drm/xe: Process deferred GGTT node removals on device unwind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While we are indirectly draining our dedicated workqueue ggtt->wq that we use to complete asynchronous removal of some GGTT nodes, this happends as part of the managed-drm unwinding (ggtt_fini_early), which could be later then manage-device unwinding, where we could already unmap our MMIO/GMS mapping (mmio_fini). This was recently observed during unsuccessful VF initialization: [ ] xe 0000:00:02.1: probe with driver xe failed with error -62 [ ] xe 0000:00:02.1: DEVRES REL ffff88811e747340 __xe_bo_unpin_map_no_vm (16 bytes) [ ] xe 0000:00:02.1: DEVRES REL ffff88811e747540 __xe_bo_unpin_map_no_vm (16 bytes) [ ] xe 0000:00:02.1: DEVRES REL ffff88811e747240 __xe_bo_unpin_map_no_vm (16 bytes) [ ] xe 0000:00:02.1: DEVRES REL ffff88811e747040 tiles_fini (16 bytes) [ ] xe 0000:00:02.1: DEVRES REL ffff88811e746840 mmio_fini (16 bytes) [ ] xe 0000:00:02.1: DEVRES REL ffff88811e747f40 xe_bo_pinned_fini (16 bytes) [ ] xe 0000:00:02.1: DEVRES REL ffff88811e746b40 devm_drm_dev_init_release (16 bytes) [ ] xe 0000:00:02.1: [drm:drm_managed_release] drmres release begin [ ] xe 0000:00:02.1: [drm:drm_managed_release] REL ffff88810ef81640 __fini_relay (8 bytes) [ ] xe 0000:00:02.1: [drm:drm_managed_release] REL ffff88810ef80d40 guc_ct_fini (8 bytes) [ ] xe 0000:00:02.1: [drm:drm_managed_release] REL ffff88810ef80040 __drmm_mutex_release (8 bytes) [ ] xe 0000:00:02.1: [drm:drm_managed_release] REL ffff88810ef80140 ggtt_fini_early (8 bytes) and this was leading to: [ ] BUG: unable to handle page fault for address: ffffc900058162a0 [ ] #PF: supervisor write access in kernel mode [ ] #PF: error_code(0x0002) - not-present page [ ] Oops: Oops: 0002 [#1] SMP NOPTI [ ] Tainted: [W]=WARN [ ] Workqueue: xe-ggtt-wq ggtt_node_remove_work_func [xe] [ ] RIP: 0010:xe_ggtt_set_pte+0x6d/0x350 [xe] [ ] Call Trace: [ ] [ ] xe_ggtt_clear+0xb0/0x270 [xe] [ ] ggtt_node_remove+0xbb/0x120 [xe] [ ] ggtt_node_remove_work_func+0x30/0x50 [xe] [ ] process_one_work+0x22b/0x6f0 [ ] worker_thread+0x1e8/0x3d Add managed-device action that will explicitly drain the workqueue with all pending node removals prior to releasing MMIO/GSM mapping. Fixes: 919bb54e989c ("drm/xe: Fix missing runtime outer protection for ggtt_remove_node") Signed-off-by: Michal Wajdeczko Cc: Rodrigo Vivi Cc: Lucas De Marchi Reviewed-by: Rodrigo Vivi Link: https://lore.kernel.org/r/20250612220937.857-2-michal.wajdeczko@intel.com (cherry picked from commit 89d2835c3680ab1938e22ad81b1c9f8c686bd391) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_ggtt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c index 7062115909f2..2c799958c1e4 100644 --- a/drivers/gpu/drm/xe/xe_ggtt.c +++ b/drivers/gpu/drm/xe/xe_ggtt.c @@ -201,6 +201,13 @@ static const struct xe_ggtt_pt_ops xelpg_pt_wa_ops = { .ggtt_set_pte = xe_ggtt_set_pte_and_flush, }; +static void dev_fini_ggtt(void *arg) +{ + struct xe_ggtt *ggtt = arg; + + drain_workqueue(ggtt->wq); +} + /** * xe_ggtt_init_early - Early GGTT initialization * @ggtt: the &xe_ggtt to be initialized @@ -257,6 +264,10 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt) if (err) return err; + err = devm_add_action_or_reset(xe->drm.dev, dev_fini_ggtt, ggtt); + if (err) + return err; + if (IS_SRIOV_VF(xe)) { err = xe_gt_sriov_vf_prepare_ggtt(xe_tile_get_gt(ggtt->tile, 0)); if (err) From 969127bf0783a4ac0c8a27e633a9e8ea1738583f Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 26 Jun 2025 12:20:45 +1000 Subject: [PATCH 282/297] ublk: sanity check add_dev input for underflow Add additional checks that queue depth and number of queues are non-zero. Signed-off-by: Ronnie Sahlberg Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250626022046.235018-1-ronniesahlberg@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d441d3259edb..c3e3c3b65a6d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2849,7 +2849,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; - if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES) + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || + info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) return -EINVAL; if (capable(CAP_SYS_ADMIN)) From c007062188d8e402c294117db53a24b2bed2b83f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 26 Jun 2025 19:57:43 +0800 Subject: [PATCH 283/297] block: fix false warning in bdev_count_inflight_rw() While bdev_count_inflight is interating all cpus, if some IOs are issued from traversed cpu and then completed from the cpu that is not traversed yet: cpu0 cpu1 bdev_count_inflight //for_each_possible_cpu // cpu0 is 0 infliht += 0 // issue a io blk_account_io_start // cpu0 inflight ++ cpu2 // the io is done blk_account_io_done // cpu2 inflight -- // cpu 1 is 0 inflight += 0 // cpu2 is -1 inflight += -1 ... In this case, the total inflight will be -1, causing lots of false warning. Fix the problem by removing the warning. Noted there is still a valid warning for nvme-mpath(From Yi) that is not fixed yet. Fixes: f5482ee5edb9 ("block: WARN if bdev inflight counter is negative") Reported-by: Yi Zhang Closes: https://lore.kernel.org/linux-block/aFtUXy-lct0WxY2w@mozart.vkv.me/T/#mae89155a5006463d0a21a4a2c35ae0034b26a339 Reported-and-tested-by: Calvin Owens Closes: https://lore.kernel.org/linux-block/aFtUXy-lct0WxY2w@mozart.vkv.me/T/#m1d935a00070bf95055d0ac84e6075158b08acaef Reported-by: Dave Chinner Closes: https://lore.kernel.org/linux-block/aFuypjqCXo9-5_En@dread.disaster.area/ Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20250626115743.1641443-1-yukuai3@huawei.com Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 8171a6bc3210..c26733f6324b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -128,23 +128,27 @@ static void part_stat_read_all(struct block_device *part, static void bdev_count_inflight_rw(struct block_device *part, unsigned int inflight[2], bool mq_driver) { + int write = 0; + int read = 0; int cpu; if (mq_driver) { blk_mq_in_driver_rw(part, inflight); - } else { - for_each_possible_cpu(cpu) { - inflight[READ] += part_stat_local_read_cpu( - part, in_flight[READ], cpu); - inflight[WRITE] += part_stat_local_read_cpu( - part, in_flight[WRITE], cpu); - } + return; } - if (WARN_ON_ONCE((int)inflight[READ] < 0)) - inflight[READ] = 0; - if (WARN_ON_ONCE((int)inflight[WRITE] < 0)) - inflight[WRITE] = 0; + for_each_possible_cpu(cpu) { + read += part_stat_local_read_cpu(part, in_flight[READ], cpu); + write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu); + } + + /* + * While iterating all CPUs, some IOs may be issued from a CPU already + * traversed and complete on a CPU that has not yet been traversed, + * causing the inflight number to be negative. + */ + inflight[READ] = read > 0 ? read : 0; + inflight[WRITE] = write > 0 ? write : 0; } /** From 711741f94ac3cf9f4e3aa73aa171e76d188c0819 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 25 Jun 2025 12:22:38 -0300 Subject: [PATCH 284/297] smb: client: fix potential deadlock when reconnecting channels Fix cifs_signal_cifsd_for_reconnect() to take the correct lock order and prevent the following deadlock from happening ====================================================== WARNING: possible circular locking dependency detected 6.16.0-rc3-build2+ #1301 Tainted: G S W ------------------------------------------------------ cifsd/6055 is trying to acquire lock: ffff88810ad56038 (&tcp_ses->srv_lock){+.+.}-{3:3}, at: cifs_signal_cifsd_for_reconnect+0x134/0x200 but task is already holding lock: ffff888119c64330 (&ret_buf->chan_lock){+.+.}-{3:3}, at: cifs_signal_cifsd_for_reconnect+0xcf/0x200 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&ret_buf->chan_lock){+.+.}-{3:3}: validate_chain+0x1cf/0x270 __lock_acquire+0x60e/0x780 lock_acquire.part.0+0xb4/0x1f0 _raw_spin_lock+0x2f/0x40 cifs_setup_session+0x81/0x4b0 cifs_get_smb_ses+0x771/0x900 cifs_mount_get_session+0x7e/0x170 cifs_mount+0x92/0x2d0 cifs_smb3_do_mount+0x161/0x460 smb3_get_tree+0x55/0x90 vfs_get_tree+0x46/0x180 do_new_mount+0x1b0/0x2e0 path_mount+0x6ee/0x740 do_mount+0x98/0xe0 __do_sys_mount+0x148/0x180 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #1 (&ret_buf->ses_lock){+.+.}-{3:3}: validate_chain+0x1cf/0x270 __lock_acquire+0x60e/0x780 lock_acquire.part.0+0xb4/0x1f0 _raw_spin_lock+0x2f/0x40 cifs_match_super+0x101/0x320 sget+0xab/0x270 cifs_smb3_do_mount+0x1e0/0x460 smb3_get_tree+0x55/0x90 vfs_get_tree+0x46/0x180 do_new_mount+0x1b0/0x2e0 path_mount+0x6ee/0x740 do_mount+0x98/0xe0 __do_sys_mount+0x148/0x180 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #0 (&tcp_ses->srv_lock){+.+.}-{3:3}: check_noncircular+0x95/0xc0 check_prev_add+0x115/0x2f0 validate_chain+0x1cf/0x270 __lock_acquire+0x60e/0x780 lock_acquire.part.0+0xb4/0x1f0 _raw_spin_lock+0x2f/0x40 cifs_signal_cifsd_for_reconnect+0x134/0x200 __cifs_reconnect+0x8f/0x500 cifs_handle_standard+0x112/0x280 cifs_demultiplex_thread+0x64d/0xbc0 kthread+0x2f7/0x310 ret_from_fork+0x2a/0x230 ret_from_fork_asm+0x1a/0x30 other info that might help us debug this: Chain exists of: &tcp_ses->srv_lock --> &ret_buf->ses_lock --> &ret_buf->chan_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&ret_buf->chan_lock); lock(&ret_buf->ses_lock); lock(&ret_buf->chan_lock); lock(&tcp_ses->srv_lock); *** DEADLOCK *** 3 locks held by cifsd/6055: #0: ffffffff857de398 (&cifs_tcp_ses_lock){+.+.}-{3:3}, at: cifs_signal_cifsd_for_reconnect+0x7b/0x200 #1: ffff888119c64060 (&ret_buf->ses_lock){+.+.}-{3:3}, at: cifs_signal_cifsd_for_reconnect+0x9c/0x200 #2: ffff888119c64330 (&ret_buf->chan_lock){+.+.}-{3:3}, at: cifs_signal_cifsd_for_reconnect+0xcf/0x200 Cc: linux-cifs@vger.kernel.org Reported-by: David Howells Fixes: d7d7a66aacd6 ("cifs: avoid use of global locks for high contention data") Reviewed-by: David Howells Tested-by: David Howells Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: David Howells Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/connect.c | 58 +++++++++++++++++++++++++--------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 45e94e18f4d5..318a8405d475 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -709,6 +709,7 @@ inc_rfc1001_len(void *buf, int count) struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; + struct list_head rlist; /* reconnect list */ spinlock_t srv_lock; /* protect anything here that is not protected */ __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index c48869c29e15..685c65dcb8c4 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -124,6 +124,14 @@ static void smb2_query_server_interfaces(struct work_struct *work) (SMB_INTERFACE_POLL_INTERVAL * HZ)); } +#define set_need_reco(server) \ +do { \ + spin_lock(&server->srv_lock); \ + if (server->tcpStatus != CifsExiting) \ + server->tcpStatus = CifsNeedReconnect; \ + spin_unlock(&server->srv_lock); \ +} while (0) + /* * Update the tcpStatus for the server. * This is used to signal the cifsd thread to call cifs_reconnect @@ -137,39 +145,45 @@ void cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, bool all_channels) { - struct TCP_Server_Info *pserver; + struct TCP_Server_Info *nserver; struct cifs_ses *ses; + LIST_HEAD(reco); int i; - /* If server is a channel, select the primary channel */ - pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; - /* if we need to signal just this channel */ if (!all_channels) { - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsExiting) - server->tcpStatus = CifsNeedReconnect; - spin_unlock(&server->srv_lock); + set_need_reco(server); return; } - spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { - if (cifs_ses_exiting(ses)) - continue; - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (!ses->chans[i].server) + if (SERVER_IS_CHAN(server)) + server = server->primary_server; + scoped_guard(spinlock, &cifs_tcp_ses_lock) { + set_need_reco(server); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); continue; - - spin_lock(&ses->chans[i].server->srv_lock); - if (ses->chans[i].server->tcpStatus != CifsExiting) - ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&ses->chans[i].server->srv_lock); + } + spin_lock(&ses->chan_lock); + for (i = 1; i < ses->chan_count; i++) { + nserver = ses->chans[i].server; + if (!nserver) + continue; + nserver->srv_count++; + list_add(&nserver->rlist, &reco); + } + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); } - spin_unlock(&ses->chan_lock); } - spin_unlock(&cifs_tcp_ses_lock); + + list_for_each_entry_safe(server, nserver, &reco, rlist) { + list_del_init(&server->rlist); + set_need_reco(server); + cifs_put_tcp_session(server, 0); + } } /* From 43e7e284fc77b710d899569360ea46fa3374ae22 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Jun 2025 14:15:04 +0100 Subject: [PATCH 285/297] cifs: Fix the smbd_response slab to allow usercopy The handling of received data in the smbdirect client code involves using copy_to_iter() to copy data from the smbd_reponse struct's packet trailer to a folioq buffer provided by netfslib that encapsulates a chunk of pagecache. If, however, CONFIG_HARDENED_USERCOPY=y, this will result in the checks then performed in copy_to_iter() oopsing with something like the following: CIFS: Attempting to mount //172.31.9.1/test CIFS: VFS: RDMA transport established usercopy: Kernel memory exposure attempt detected from SLUB object 'smbd_response_0000000091e24ea1' (offset 81, size 63)! ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:102! ... RIP: 0010:usercopy_abort+0x6c/0x80 ... Call Trace: __check_heap_object+0xe3/0x120 __check_object_size+0x4dc/0x6d0 smbd_recv+0x77f/0xfe0 [cifs] cifs_readv_from_socket+0x276/0x8f0 [cifs] cifs_read_from_socket+0xcd/0x120 [cifs] cifs_demultiplex_thread+0x7e9/0x2d50 [cifs] kthread+0x396/0x830 ret_from_fork+0x2b8/0x3b0 ret_from_fork_asm+0x1a/0x30 The problem is that the smbd_response slab's packet field isn't marked as being permitted for usercopy. Fix this by passing parameters to kmem_slab_create() to indicate that copy_to_iter() is permitted from the packet region of the smbd_response slab objects, less the header space. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Stefan Metzmacher Link: https://lore.kernel.org/r/acb7f612-df26-4e2a-a35d-7cd040f513e1@samba.org/ Signed-off-by: David Howells Reviewed-by: Stefan Metzmacher Tested-by: Stefan Metzmacher cc: Paulo Alcantara cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index a976bcf61226..0a9fd6c399f6 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1475,6 +1475,9 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) char name[MAX_NAME_LEN]; int rc; + if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer))) + return -ENOMEM; + scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); info->request_cache = kmem_cache_create( @@ -1492,12 +1495,17 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) goto out1; scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + + struct kmem_cache_args response_args = { + .align = __alignof__(struct smbd_response), + .useroffset = (offsetof(struct smbd_response, packet) + + sizeof(struct smbdirect_data_transfer)), + .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer), + }; info->response_cache = - kmem_cache_create( - name, - sizeof(struct smbd_response) + - sp->max_recv_size, - 0, SLAB_HWCACHE_ALIGN, NULL); + kmem_cache_create(name, + sizeof(struct smbd_response) + sp->max_recv_size, + &response_args, SLAB_HWCACHE_ALIGN); if (!info->response_cache) goto out2; From 263debecb4aa7cec0a86487e6f409814f6194a21 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Apr 2025 20:27:26 +0100 Subject: [PATCH 286/297] cifs: Fix reading into an ITER_FOLIOQ from the smbdirect code When performing a file read from RDMA, smbd_recv() prints an "Invalid msg type 4" error and fails the I/O. This is due to the switch-statement there not handling the ITER_FOLIOQ handed down from netfslib. Fix this by collapsing smbd_recv_buf() and smbd_recv_page() into smbd_recv() and just using copy_to_iter() instead of memcpy(). This future-proofs the function too, in case more ITER_* types are added. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Stefan Metzmacher Signed-off-by: David Howells cc: Tom Talpey cc: Paulo Alcantara (Red Hat) cc: Matthew Wilcox cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 112 ++++++-------------------------------- 1 file changed, 17 insertions(+), 95 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 0a9fd6c399f6..754e94a0e07f 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1778,35 +1778,39 @@ struct smbd_connection *smbd_get_connection( } /* - * Receive data from receive reassembly queue + * Receive data from the transport's receive reassembly queue * All the incoming data packets are placed in reassembly queue - * buf: the buffer to read data into + * iter: the buffer to read data into * size: the length of data to read * return value: actual data read - * Note: this implementation copies the data from reassebmly queue to receive + * + * Note: this implementation copies the data from reassembly queue to receive * buffers used by upper layer. This is not the optimal code path. A better way * to do it is to not have upper layer allocate its receive buffers but rather * borrow the buffer from reassembly queue, and return it after data is * consumed. But this will require more changes to upper layer code, and also * need to consider packet boundaries while they still being reassembled. */ -static int smbd_recv_buf(struct smbd_connection *info, char *buf, - unsigned int size) +int smbd_recv(struct smbd_connection *info, struct msghdr *msg) { struct smbdirect_socket *sc = &info->socket; struct smbd_response *response; struct smbdirect_data_transfer *data_transfer; + size_t size = iov_iter_count(&msg->msg_iter); int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; int rc; + if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE)) + return -EINVAL; /* It's a bug in upper layer to get there */ + again: /* * No need to hold the reassembly queue lock all the time as we are * the only one reading from the front of the queue. The transport * may add more entries to the back of the queue at the same time */ - log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size, + log_read(INFO, "size=%zd info->reassembly_data_length=%d\n", size, info->reassembly_data_length); if (info->reassembly_data_length >= size) { int queue_length; @@ -1844,7 +1848,10 @@ static int smbd_recv_buf(struct smbd_connection *info, char *buf, if (response->first_segment && size == 4) { unsigned int rfc1002_len = data_length + remaining_data_length; - *((__be32 *)buf) = cpu_to_be32(rfc1002_len); + __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len); + if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr), + &msg->msg_iter) != sizeof(rfc1002_hdr)) + return -EFAULT; data_read = 4; response->first_segment = false; log_read(INFO, "returning rfc1002 length %d\n", @@ -1853,10 +1860,9 @@ static int smbd_recv_buf(struct smbd_connection *info, char *buf, } to_copy = min_t(int, data_length - offset, to_read); - memcpy( - buf + data_read, - (char *)data_transfer + data_offset + offset, - to_copy); + if (copy_to_iter((char *)data_transfer + data_offset + offset, + to_copy, &msg->msg_iter) != to_copy) + return -EFAULT; /* move on to the next buffer? */ if (to_copy == data_length - offset) { @@ -1921,90 +1927,6 @@ static int smbd_recv_buf(struct smbd_connection *info, char *buf, goto again; } -/* - * Receive a page from receive reassembly queue - * page: the page to read data into - * to_read: the length of data to read - * return value: actual data read - */ -static int smbd_recv_page(struct smbd_connection *info, - struct page *page, unsigned int page_offset, - unsigned int to_read) -{ - struct smbdirect_socket *sc = &info->socket; - int ret; - char *to_address; - void *page_address; - - /* make sure we have the page ready for read */ - ret = wait_event_interruptible( - info->wait_reassembly_queue, - info->reassembly_data_length >= to_read || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (ret) - return ret; - - /* now we can read from reassembly queue and not sleep */ - page_address = kmap_atomic(page); - to_address = (char *) page_address + page_offset; - - log_read(INFO, "reading from page=%p address=%p to_read=%d\n", - page, to_address, to_read); - - ret = smbd_recv_buf(info, to_address, to_read); - kunmap_atomic(page_address); - - return ret; -} - -/* - * Receive data from transport - * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC - * return: total bytes read, or 0. SMB Direct will not do partial read. - */ -int smbd_recv(struct smbd_connection *info, struct msghdr *msg) -{ - char *buf; - struct page *page; - unsigned int to_read, page_offset; - int rc; - - if (iov_iter_rw(&msg->msg_iter) == WRITE) { - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "Invalid msg iter dir %u\n", - iov_iter_rw(&msg->msg_iter)); - rc = -EINVAL; - goto out; - } - - switch (iov_iter_type(&msg->msg_iter)) { - case ITER_KVEC: - buf = msg->msg_iter.kvec->iov_base; - to_read = msg->msg_iter.kvec->iov_len; - rc = smbd_recv_buf(info, buf, to_read); - break; - - case ITER_BVEC: - page = msg->msg_iter.bvec->bv_page; - page_offset = msg->msg_iter.bvec->bv_offset; - to_read = msg->msg_iter.bvec->bv_len; - rc = smbd_recv_page(info, page, page_offset, to_read); - break; - - default: - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "Invalid msg type %d\n", - iov_iter_type(&msg->msg_iter)); - rc = -EINVAL; - } - -out: - /* SMBDirect will read it all or nothing */ - if (rc > 0) - msg->msg_iter.count = 0; - return rc; -} - /* * Send data to transport * Each rqst is transported as a SMBDirect payload From 178b8ff66ff827c41b4fa105e9aabb99a0b5c537 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2025 12:17:48 -0600 Subject: [PATCH 287/297] io_uring/kbuf: flag partial buffer mappings A previous commit aborted mapping more for a non-incremental ring for bundle peeking, but depending on where in the process this peeking happened, it would not necessarily prevent a retry by the user. That can create gaps in the received/read data. Add struct buf_sel_arg->partial_map, which can pass this information back. The networking side can then map that to internal state and use it to gate retry as well. Since this necessitates a new flag, change io_sr_msg->retry to a retry_flags member, and store both the retry and partial map condition in there. Cc: stable@vger.kernel.org Fixes: 26ec15e4b0c1 ("io_uring/kbuf: don't truncate end buffer for multiple buffer peeks") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 1 + io_uring/kbuf.h | 3 ++- io_uring/net.c | 23 +++++++++++++++-------- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index ce95e3af44a9..f2d2cc319faa 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -271,6 +271,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, if (len > arg->max_len) { len = arg->max_len; if (!(bl->flags & IOBL_INC)) { + arg->partial_map = 1; if (iov != arg->iovs) break; buf->len = len; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 5d83c7adc739..723d0361898e 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -58,7 +58,8 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; - unsigned buf_group; + unsigned short buf_group; + unsigned short partial_map; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, diff --git a/io_uring/net.c b/io_uring/net.c index 5c1e8c4ba468..43a43522f406 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -75,12 +75,17 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - bool retry; + unsigned short retry_flags; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; +enum sr_retry_flags { + IO_SR_MSG_RETRY = 1, + IO_SR_MSG_PARTIAL_MAP = 2, +}; + /* * Number of times we'll try and do receives if there's more data. If we * exceed this limit, then add us to the back of the queue and retry from @@ -187,7 +192,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; sr->len = 0; /* get from the provided buffer */ } @@ -397,7 +402,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -751,7 +756,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -823,7 +828,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), issue_flags); - if (sr->retry) + if (sr->retry_flags & IO_SR_MSG_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) @@ -832,12 +837,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * If more is available AND it was a full transfer, retry and * append to this one */ - if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 && + if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 && !iov_iter_count(&kmsg->msg.msg_iter)) { req->cqe.flags = cflags & ~CQE_F_MASK; sr->len = kmsg->msg.msg_inq; sr->done_io += this_ret; - sr->retry = true; + sr->retry_flags |= IO_SR_MSG_RETRY; return false; } } else { @@ -1082,6 +1087,8 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg kmsg->vec.iovec = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } + if (arg.partial_map) + sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP; /* special case 1 vec, can be a fast path */ if (ret == 1) { @@ -1276,7 +1283,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; zc->done_io = 0; - zc->retry = false; + zc->retry_flags = 0; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; From 9159c5e733cfa35ec863fa81960a3e7435f831fb Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 27 Jun 2025 18:27:44 +0800 Subject: [PATCH 288/297] LoongArch: KVM: Add address alignment check for IOCSR emulation IOCSR instruction supports 1/2/4/8 bytes access, the address should be naturally aligned with its access size. Here address alignment check is added in the EIOINTC kernel emulation. Cc: stable@vger.kernel.org Fixes: 3956a52bc05b ("LoongArch: KVM: Add EIOINTC read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 9c47456b805c..236cbf979167 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -305,6 +305,11 @@ static int kvm_eiointc_read(struct kvm_vcpu *vcpu, return -EINVAL; } + if (addr & (len - 1)) { + kvm_err("%s: eiointc not aligned addr %llx len %d\n", __func__, addr, len); + return -EINVAL; + } + vcpu->kvm->stat.eiointc_read_exits++; spin_lock_irqsave(&eiointc->lock, flags); switch (len) { @@ -676,6 +681,11 @@ static int kvm_eiointc_write(struct kvm_vcpu *vcpu, return -EINVAL; } + if (addr & (len - 1)) { + kvm_err("%s: eiointc not aligned addr %llx len %d\n", __func__, addr, len); + return -EINVAL; + } + vcpu->kvm->stat.eiointc_write_exits++; spin_lock_irqsave(&eiointc->lock, flags); switch (len) { From c34bbc2c990700ba07b271fc7c8113b0bc3e4093 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 27 Jun 2025 18:27:44 +0800 Subject: [PATCH 289/297] LoongArch: KVM: Fix interrupt route update with EIOINTC With function eiointc_update_sw_coremap(), there is forced assignment like val = *(u64 *)pvalue. Parameter pvalue may be pointer to char type or others, there is problem with forced assignment with u64 type. Here the detailed value is passed rather address pointer. Cc: stable@vger.kernel.org Fixes: 3956a52bc05b ("LoongArch: KVM: Add EIOINTC read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 236cbf979167..8d3f48e2a7f0 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -66,10 +66,9 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) } static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s, - int irq, void *pvalue, u32 len, bool notify) + int irq, u64 val, u32 len, bool notify) { int i, cpu; - u64 val = *(u64 *)pvalue; for (i = 0; i < len; i++) { cpu = val & 0xff; @@ -403,7 +402,7 @@ static int loongarch_eiointc_writeb(struct kvm_vcpu *vcpu, irq = offset - EIOINTC_COREMAP_START; index = irq; s->coremap.reg_u8[index] = data; - eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + eiointc_update_sw_coremap(s, irq, data, sizeof(data), true); break; default: ret = -EINVAL; @@ -488,7 +487,7 @@ static int loongarch_eiointc_writew(struct kvm_vcpu *vcpu, irq = offset - EIOINTC_COREMAP_START; index = irq >> 1; s->coremap.reg_u16[index] = data; - eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + eiointc_update_sw_coremap(s, irq, data, sizeof(data), true); break; default: ret = -EINVAL; @@ -573,7 +572,7 @@ static int loongarch_eiointc_writel(struct kvm_vcpu *vcpu, irq = offset - EIOINTC_COREMAP_START; index = irq >> 2; s->coremap.reg_u32[index] = data; - eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + eiointc_update_sw_coremap(s, irq, data, sizeof(data), true); break; default: ret = -EINVAL; @@ -658,7 +657,7 @@ static int loongarch_eiointc_writeq(struct kvm_vcpu *vcpu, irq = offset - EIOINTC_COREMAP_START; index = irq >> 3; s->coremap.reg_u64[index] = data; - eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + eiointc_update_sw_coremap(s, irq, data, sizeof(data), true); break; default: ret = -EINVAL; @@ -816,7 +815,7 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, for (i = 0; i < (EIOINTC_IRQS / 4); i++) { start_irq = i * 4; eiointc_update_sw_coremap(s, start_irq, - (void *)&s->coremap.reg_u32[i], sizeof(u32), false); + s->coremap.reg_u32[i], sizeof(u32), false); } break; default: From 45515c643d0abb75c2cc760a6bc6b235eadafd66 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 27 Jun 2025 18:27:44 +0800 Subject: [PATCH 290/297] LoongArch: KVM: Check interrupt route from physical CPU With EIOINTC interrupt controller, physical CPU ID is set for irq route. However the function kvm_get_vcpu() is used to get destination vCPU when delivering irq. With API kvm_get_vcpu(), the logical CPU ID is used. With API kvm_get_vcpu_by_cpuid(), vCPU ID can be searched from physical CPU ID. Cc: stable@vger.kernel.org Fixes: 3956a52bc05b ("LoongArch: KVM: Add EIOINTC read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 8d3f48e2a7f0..644fb7785c07 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -9,7 +9,8 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s) { - int ipnum, cpu, irq_index, irq_mask, irq; + int ipnum, cpu, cpuid, irq_index, irq_mask, irq; + struct kvm_vcpu *vcpu; for (irq = 0; irq < EIOINTC_IRQS; irq++) { ipnum = s->ipmap.reg_u8[irq / 32]; @@ -20,7 +21,12 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s) irq_index = irq / 32; irq_mask = BIT(irq & 0x1f); - cpu = s->coremap.reg_u8[irq]; + cpuid = s->coremap.reg_u8[irq]; + vcpu = kvm_get_vcpu_by_cpuid(s->kvm, cpuid); + if (!vcpu) + continue; + + cpu = vcpu->vcpu_id; if (!!(s->coreisr.reg_u32[cpu][irq_index] & irq_mask)) set_bit(irq, s->sw_coreisr[cpu][ipnum]); else @@ -68,17 +74,23 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s, int irq, u64 val, u32 len, bool notify) { - int i, cpu; + int i, cpu, cpuid; + struct kvm_vcpu *vcpu; for (i = 0; i < len; i++) { - cpu = val & 0xff; + cpuid = val & 0xff; val = val >> 8; if (!(s->status & BIT(EIOINTC_ENABLE_CPU_ENCODE))) { - cpu = ffs(cpu) - 1; - cpu = (cpu >= 4) ? 0 : cpu; + cpuid = ffs(cpuid) - 1; + cpuid = (cpuid >= 4) ? 0 : cpuid; } + vcpu = kvm_get_vcpu_by_cpuid(s->kvm, cpuid); + if (!vcpu) + continue; + + cpu = vcpu->vcpu_id; if (s->sw_coremap[irq + i] == cpu) continue; From cc8d5b209e09d3b52bca1ffe00045876842d96ae Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 27 Jun 2025 18:27:44 +0800 Subject: [PATCH 291/297] LoongArch: KVM: Check validity of "num_cpu" from user space The maximum supported cpu number is EIOINTC_ROUTE_MAX_VCPUS about irqchip EIOINTC, here add validation about cpu number to avoid array pointer overflow. Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 644fb7785c07..056a75f7d090 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -805,7 +805,7 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, int ret = 0; unsigned long flags; unsigned long type = (unsigned long)attr->attr; - u32 i, start_irq; + u32 i, start_irq, val; void __user *data; struct loongarch_eiointc *s = dev->kvm->arch.eiointc; @@ -813,8 +813,14 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, spin_lock_irqsave(&s->lock, flags); switch (type) { case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: - if (copy_from_user(&s->num_cpu, data, 4)) + if (copy_from_user(&val, data, 4)) ret = -EFAULT; + else { + if (val >= EIOINTC_ROUTE_MAX_VCPUS) + ret = -EINVAL; + else + s->num_cpu = val; + } break; case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: if (copy_from_user(&s->features, data, 4)) @@ -842,7 +848,7 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { - int addr, cpuid, offset, ret = 0; + int addr, cpu, offset, ret = 0; unsigned long flags; void *p = NULL; void __user *data; @@ -850,7 +856,7 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, s = dev->kvm->arch.eiointc; addr = attr->attr; - cpuid = addr >> 16; + cpu = addr >> 16; addr &= 0xffff; data = (void __user *)attr->addr; switch (addr) { @@ -875,8 +881,11 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, p = &s->isr.reg_u32[offset]; break; case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + if (cpu >= s->num_cpu) + return -EINVAL; + offset = (addr - EIOINTC_COREISR_START) / 4; - p = &s->coreisr.reg_u32[cpuid][offset]; + p = &s->coreisr.reg_u32[cpu][offset]; break; case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: offset = (addr - EIOINTC_COREMAP_START) / 4; From 955853cf83657faa58572ef3f08b44f0f88885c1 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 27 Jun 2025 18:27:44 +0800 Subject: [PATCH 292/297] LoongArch: KVM: Disable updating of "num_cpu" and "feature" Property "num_cpu" and "feature" are read-only once eiointc is created, which are set with KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL attr group before device creation. Attr group KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS is to update register and software state for migration and reset usage, property "num_cpu" and "feature" can not be update again if it is created already. Here discard write operation with property "num_cpu" and "feature" in attr group KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL. Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 056a75f7d090..a75f865d6fb9 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -926,9 +926,15 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev, data = (void __user *)attr->addr; switch (addr) { case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU: + if (is_write) + return ret; + p = &s->num_cpu; break; case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE: + if (is_write) + return ret; + p = &s->features; break; case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE: From f40213cd93e608ee78b5e25db042c42ec07139fe Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Jun 2025 09:56:52 +0200 Subject: [PATCH 293/297] i2c: scx200_acb: depends on HAS_IOPORT It already depends on X86_32, but that's also set for ARCH=um. Recent changes made UML no longer have IO port access since it's not needed, but this driver uses it. Build it only for HAS_IOPORT. This is pretty much the same as depending on X86, but on the off-chance that HAS_IOPORT will ever be optional on x86 HAS_IOPORT is the real prerequisite. Signed-off-by: Johannes Berg Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 48c5ab832009..0a4ecccd1851 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1530,7 +1530,7 @@ config I2C_XGENE_SLIMPRO config SCx200_ACB tristate "Geode ACCESS.bus support" - depends on X86_32 && PCI + depends on X86_32 && PCI && HAS_IOPORT help Enable the use of the ACCESS.bus controllers on the Geode SCx200 and SC1100 processors and the CS5535 and CS5536 Geode companion devices. From 6921d1e07cb5eddec830801087b419194fde0803 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 24 Jun 2025 14:38:46 +0800 Subject: [PATCH 294/297] tracing: Fix filter logic error If the processing of the tr->events loop fails, the filter that has been added to filter_head will be released twice in free_filter_list(&head->rcu) and __free_filter(filter). After adding the filter of tr->events, add the filter to the filter_head process to avoid triggering uaf. Link: https://lore.kernel.org/tencent_4EF87A626D702F816CD0951CE956EC32CD0A@qq.com Fixes: a9d0aab5eb33 ("tracing: Fix regression of filter waiting a long time on RCU synchronization") Reported-by: syzbot+daba72c4af9915e9c894@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=daba72c4af9915e9c894 Tested-by: syzbot+daba72c4af9915e9c894@syzkaller.appspotmail.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Edward Adam Davis Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 08141f105c95..3885aadc434d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1436,13 +1436,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, INIT_LIST_HEAD(&head->list); - item = kmalloc(sizeof(*item), GFP_KERNEL); - if (!item) - goto free_now; - - item->filter = filter; - list_add_tail(&item->list, &head->list); - list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; @@ -1454,6 +1447,13 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, event_clear_filter(file); } + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); return; free_now: From d0b3b7b22dfa1f4b515fd3a295b3fd958f9e81af Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Jun 2025 13:09:04 -0700 Subject: [PATCH 295/297] Linux 6.16-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f884dfe10246..1c9ea229809f 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From e47a324d6f07c9ef252cfce1f14cfa5110cbed99 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 27 Jun 2025 18:40:04 -0500 Subject: [PATCH 296/297] dt-bindings: trigger-source: add ADI Util Sigma-Delta SPI Add new binding for the ADI Util Sigma-Delta SPI FPGA IP Core. This is used to trigger a SPI offload based on a RDY signal from the ADC while masking out other signals on the same line. Reviewed-by: Rob Herring (Arm) Signed-off-by: David Lechner Link: https://patch.msgid.link/20250627-iio-adc-ad7173-add-spi-offload-support-v2-8-f49c55599113@baylibre.com Signed-off-by: Mark Brown --- .../adi,util-sigma-delta-spi.yaml | 49 +++++++++++++++++++ MAINTAINERS | 5 ++ 2 files changed, 54 insertions(+) create mode 100644 Documentation/devicetree/bindings/trigger-source/adi,util-sigma-delta-spi.yaml diff --git a/Documentation/devicetree/bindings/trigger-source/adi,util-sigma-delta-spi.yaml b/Documentation/devicetree/bindings/trigger-source/adi,util-sigma-delta-spi.yaml new file mode 100644 index 000000000000..ea466179551c --- /dev/null +++ b/Documentation/devicetree/bindings/trigger-source/adi,util-sigma-delta-spi.yaml @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (c) 2025 Analog Devices, Inc. +# Copyright (c) 2025 BayLibre, SAS + +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/trigger-source/adi,util-sigma-delta-spi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices Util Sigma-Delta SPI IP Core + +maintainers: + - David Lechner + +description: + The Util Sigma-Delta SPI is an FPGA IP core from Analog Devices that provides + a SPI offload trigger from the RDY signal of the combined DOUT/RDY pin of + the sigma-delta family of ADCs. + https://analogdevicesinc.github.io/hdl/library/util_sigma_delta_spi/index.html + +properties: + compatible: + const: adi,util-sigma-delta-spi + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + '#trigger-source-cells': + const: 0 + +required: + - compatible + - reg + - clocks + - '#trigger-source-cells' + +additionalProperties: false + +examples: + - | + trigger@40000 { + reg = <0x40000 0x1000>; + compatible = "adi,util-sigma-delta-spi"; + clocks = <&clk 0>; + #trigger-source-cells = <0>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 4bac4ea21b64..d7b5e8572279 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -25235,6 +25235,11 @@ W: https://github.com/srcres258/linux-doc T: git git://github.com/srcres258/linux-doc.git doc-zh-tw F: Documentation/translations/zh_TW/ +TRIGGER SOURCE - ADI UTIL SIGMA DELTA SPI +M: David Lechner +S: Maintained +F: Documentation/devicetree/bindings/trigger-source/adi,util-sigma-delta-spi.yaml + TRIGGER SOURCE - PWM M: David Lechner S: Maintained From 3fcd3d2fe44dc9dfca20b6aed117f314a50ba0ff Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 27 Jun 2025 18:40:05 -0500 Subject: [PATCH 297/297] spi: offload trigger: add ADI Util Sigma-Delta SPI driver Add a new driver for the ADI Util Sigma-Delta SPI FPGA IP core. This is used to trigger a SPI offload based on a RDY signal from an ADC while masking out other signals on the same line. Signed-off-by: David Lechner Link: https://patch.msgid.link/20250627-iio-adc-ad7173-add-spi-offload-support-v2-9-f49c55599113@baylibre.com Signed-off-by: Mark Brown --- MAINTAINERS | 2 +- drivers/spi/Kconfig | 5 ++ drivers/spi/Makefile | 1 + ...spi-offload-trigger-adi-util-sigma-delta.c | 59 +++++++++++++++++++ 4 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c diff --git a/MAINTAINERS b/MAINTAINERS index d7b5e8572279..d0d5b822a1f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23413,7 +23413,7 @@ F: include/linux/mtd/spi-nor.h SPI OFFLOAD R: David Lechner -F: drivers/spi/spi-offload-trigger-pwm.c +F: drivers/spi/spi-offload-trigger-*.c F: drivers/spi/spi-offload.c F: include/linux/spi/offload/ K: spi_offload diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index c51da3fc3604..e69f060d3875 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -1355,6 +1355,11 @@ if SPI_OFFLOAD comment "SPI Offload triggers" +config SPI_OFFLOAD_TRIGGER_ADI_UTIL_SD + tristate "SPI offload trigger using ADI sigma-delta utility" + help + SPI offload trigger from ADI sigma-delta utility FPGA IP block. + config SPI_OFFLOAD_TRIGGER_PWM tristate "SPI offload trigger using PWM" depends on PWM diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index 4ea89f6fc531..51f9f16ed734 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -170,3 +170,4 @@ obj-$(CONFIG_SPI_SLAVE_SYSTEM_CONTROL) += spi-slave-system-control.o # SPI offload triggers obj-$(CONFIG_SPI_OFFLOAD_TRIGGER_PWM) += spi-offload-trigger-pwm.o +obj-$(CONFIG_SPI_OFFLOAD_TRIGGER_ADI_UTIL_SD) += spi-offload-trigger-adi-util-sigma-delta.o diff --git a/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c b/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c new file mode 100644 index 000000000000..035d088d4d33 --- /dev/null +++ b/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Analog Devices Inc. + * Copyright (C) 2025 BayLibre, SAS + */ + +#include +#include +#include +#include +#include +#include +#include + +static bool adi_util_sigma_delta_match(struct spi_offload_trigger *trigger, + enum spi_offload_trigger_type type, + u64 *args, u32 nargs) +{ + return type == SPI_OFFLOAD_TRIGGER_DATA_READY && nargs == 0; +} + +static const struct spi_offload_trigger_ops adi_util_sigma_delta_ops = { + .match = adi_util_sigma_delta_match, +}; + +static int adi_util_sigma_delta_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct spi_offload_trigger_info info = { + .fwnode = dev_fwnode(dev), + .ops = &adi_util_sigma_delta_ops, + }; + struct clk *clk; + + clk = devm_clk_get_enabled(dev, NULL); + if (IS_ERR(clk)) + return dev_err_probe(dev, PTR_ERR(clk), "Failed to get clock\n"); + + return devm_spi_offload_trigger_register(dev, &info); +} + +static const struct of_device_id adi_util_sigma_delta_of_match_table[] = { + { .compatible = "adi,util-sigma-delta-spi", }, + { } +}; +MODULE_DEVICE_TABLE(of, adi_util_sigma_delta_of_match_table); + +static struct platform_driver adi_util_sigma_delta_driver = { + .probe = adi_util_sigma_delta_probe, + .driver = { + .name = "adi-util-sigma-delta-spi", + .of_match_table = adi_util_sigma_delta_of_match_table, + }, +}; +module_platform_driver(adi_util_sigma_delta_driver); + +MODULE_AUTHOR("David Lechner "); +MODULE_DESCRIPTION("ADI Sigma-Delta SPI offload trigger utility driver"); +MODULE_LICENSE("GPL");