From 431e40042d3599559e588b8946bb28bd440b4f65 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 13 May 2026 12:29:21 -0600 Subject: [PATCH 1/5] bio-integrity-fs: pass data iter to bio_integrity_verify() bio_integrity_verify() expects the passed struct bvec_iter to be an iterator over bio data, not integrity. So construct a separate data bvec_iter without the bio_integrity_bytes() conversion and pass it to bio_integrity_verify() instead of bip_iter. Fixes: 0bde8a12b554 ("block: add fs_bio_integrity helpers") Signed-off-by: Caleb Sander Mateos Reviewed-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260513182924.1753582-1-csander@purestorage.com Signed-off-by: Jens Axboe --- block/bio-integrity-fs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/bio-integrity-fs.c b/block/bio-integrity-fs.c index acb1e5f270d2..0daa42d9ead7 100644 --- a/block/bio-integrity-fs.c +++ b/block/bio-integrity-fs.c @@ -55,6 +55,10 @@ int fs_bio_integrity_verify(struct bio *bio, sector_t sector, unsigned int size) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); + struct bvec_iter data_iter = { + .bi_sector = sector, + .bi_size = size, + }; /* * Reinitialize bip->bip_iter. @@ -65,7 +69,7 @@ int fs_bio_integrity_verify(struct bio *bio, sector_t sector, unsigned int size) memset(&bip->bip_iter, 0, sizeof(bip->bip_iter)); bip->bip_iter.bi_sector = sector; bip->bip_iter.bi_size = bio_integrity_bytes(bi, size >> SECTOR_SHIFT); - return blk_status_to_errno(bio_integrity_verify(bio, &bip->bip_iter)); + return blk_status_to_errno(bio_integrity_verify(bio, &data_iter)); } static int __init fs_bio_integrity_init(void) From 85686c72966c5ee637893f124ddb31a1cace7bee Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 19 May 2026 18:03:44 -0700 Subject: [PATCH 2/5] nvme-pci: fix dma_vecs leak on p2p memory We don't unmap P2P memory, so we don't need to track it. The dma_vec allocation was getting leaked on the completion. Fixes: b8b7570a7ec87 ("nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping") Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 139a10cd687f..f423f1718439 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -966,7 +966,8 @@ static bool nvme_pci_prp_save_mapping(struct request *req, { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - if (dma_use_iova(&iod->dma_state) || !dma_need_unmap(dma_dev)) + if (dma_use_iova(&iod->dma_state) || !dma_need_unmap(dma_dev) || + (iod->flags & IOD_DATA_P2P)) return true; if (!iod->nr_dma_vecs) { From 1bf86336e4b6cf40873fda47a7fe191446864937 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 19 May 2026 13:01:57 -0700 Subject: [PATCH 3/5] nvme-pci: fix dma mapping leak on data setup error We're leaking the initial DMA mapping during iteration if we fail to allocate the tracking descriptor for both PRP and SGL. Unmap the iterator directly; we can't use the existing unmap helper because it depends on the tracking descriptor being successfully allocated, so a new one for an in-use iterator is provided. The mappings were also leaking when the driver detects an invalid bio_vec when mapping PRPs, so fix that too. Fixes: b8b7570a7ec87 ("nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping") Fixes: 7ce3c1dd78fca ("nvme-pci: convert the data mapping to blk_rq_dma_map") Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f423f1718439..b5f846200678 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -997,6 +997,23 @@ static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, return nvme_pci_prp_save_mapping(req, dma_dev, iter); } +static void nvme_unmap_iter(struct request *req, struct blk_dma_iter *iter, + struct dma_iova_state *state) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dev = nvmeq->dev->dev; + + if (!blk_rq_dma_unmap(req, dev, state, iter->len, iter->p2pdma.map)) { + unsigned int attrs = 0; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + + dma_unmap_phys(dev, iter->addr, iter->len, rq_dma_dir(req), + attrs); + } +} + static blk_status_t nvme_pci_setup_data_prp(struct request *req, struct blk_dma_iter *iter) { @@ -1007,8 +1024,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req, unsigned int prp_len, i; __le64 *prp_list; - if (!nvme_pci_prp_save_mapping(req, nvmeq->dev->dev, iter)) + if (!nvme_pci_prp_save_mapping(req, nvmeq->dev->dev, iter)) { + nvme_unmap_iter(req, iter, &iod->dma_state); return iter->status; + } /* * PRP1 always points to the start of the DMA transfers. @@ -1113,6 +1132,7 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req, dev_err_once(nvmeq->dev->dev, "Incorrectly formed request for payload:%d nents:%d\n", blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req)); + nvme_unmap_data(req); return BLK_STS_IOERR; } @@ -1156,8 +1176,11 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req, sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, &sgl_dma); - if (!sg_list) + if (!sg_list) { + nvme_unmap_iter(req, iter, &iod->dma_state); return BLK_STS_RESOURCE; + } + iod->descriptors[iod->nr_descriptors++] = sg_list; do { @@ -1314,8 +1337,10 @@ static blk_status_t nvme_pci_setup_meta_iter(struct request *req) sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, &sgl_dma); - if (!sg_list) + if (!sg_list) { + nvme_unmap_iter(req, &iter, &iod->meta_dma_state); return BLK_STS_RESOURCE; + } iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; From dc278e9bf2b9513a763353e6b9cc21e0f532954e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 21 May 2026 12:02:53 -0700 Subject: [PATCH 4/5] blk-mq: pop cached request if it is usable When submitting a bio to blk-mq, if the task should sleep after peeking a cached request, but before it pops it, the plug flushes and calls blk_mq_free_plug_rqs, freeing the cached_rqs. This creates a use-after-free bug. Fix this by popping the cached request before any possible blocking calls if it is suitable for use. Popping this request first holds a queue reference, so avoid any serialization races with queue freezes and can safely proceed with dispatching that request to the driver. This potentially increases a timing window from when a driver wants to freeze its queue to when requests stop being dispatched. That scenario is off the fast path though, and drivers need to appropriately handle requests during a freeze request anyway. The downside is the popped element needs to be individually freed when we performed a bio plug merge. The cached request would have had to be freed later anyway, but this patch does it inline with building the plug list instead of after flushing it. Fixes: b0077e269f6c1 ("blk-mq: make sure active queue usage is held for bio_integrity_prep()") Fixes: 7b4f36cd22a65 ("block: ensure we hold a queue reference when using queue limits") Signed-off-by: Keith Busch Link: https://patch.msgid.link/20260521190253.242065-1-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d0c37daf568f..28c2d931e75e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3077,7 +3077,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, /* * Check if there is a suitable cached request and return it. */ -static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, +static struct request *blk_mq_get_cached_request(struct blk_plug *plug, struct request_queue *q, blk_opf_t opf) { enum hctx_type type = blk_mq_get_hctx_type(opf); @@ -3093,27 +3093,10 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; + rq_list_pop(&plug->cached_rqs); return rq; } -static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, - struct bio *bio) -{ - if (rq_list_pop(&plug->cached_rqs) != rq) - WARN_ON_ONCE(1); - - /* - * If any qos ->throttle() end up blocking, we will have flushed the - * plug and hence killed the cached_rq list as well. Pop this entry - * before we throttle. - */ - rq_qos_throttle(rq->q, bio); - - blk_mq_rq_time_init(rq, blk_time_get_ns()); - rq->cmd_flags = bio->bi_opf; - INIT_LIST_HEAD(&rq->queuelist); -} - static bool bio_unaligned(const struct bio *bio, struct request_queue *q) { unsigned int bs_mask = queue_logical_block_size(q) - 1; @@ -3152,7 +3135,7 @@ void blk_mq_submit_bio(struct bio *bio) /* * If the plug has a cached request for this queue, try to use it. */ - rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); + rq = blk_mq_get_cached_request(plug, q, bio->bi_opf); /* * A BIO that was released from a zone write plug has already been @@ -3211,7 +3194,10 @@ void blk_mq_submit_bio(struct bio *bio) new_request: if (rq) { - blk_mq_use_cached_rq(rq, plug, bio); + rq_qos_throttle(rq->q, bio); + blk_mq_rq_time_init(rq, blk_time_get_ns()); + rq->cmd_flags = bio->bi_opf; + INIT_LIST_HEAD(&rq->queuelist); } else { rq = blk_mq_get_new_requests(q, plug, bio); if (unlikely(!rq)) { @@ -3257,12 +3243,10 @@ void blk_mq_submit_bio(struct bio *bio) return; queue_exit: - /* - * Don't drop the queue reference if we were trying to use a cached - * request and thus didn't acquire one. - */ if (!rq) blk_queue_exit(q); + else + blk_mq_free_request(rq); } #ifdef CONFIG_BLK_MQ_STACKING From f6982769910ecddabdb5b8b9afdab0bb8b6668ac Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 22 May 2026 20:56:22 +0900 Subject: [PATCH 5/5] block: avoid use-after-free in disk_free_zone_resources() The function disk_update_zone_resources() may call disk_free_zone_resources() in case of error, and following this, blk_revalidate_disk_zones() will again calls disk_free_zone_resources() if disk_update_zone_resources() failed. If a zone worker thread is being used (which is the default for a rotational media zoned device), disk_free_zone_resources() will try to stop the zone worker thread twice because disk->zone_wplugs_worker is not reset to NULL when the worker thread is stopped the first time. In disk_free_zone_resources(), fix this by correctly clearing disk->zone_wplugs_worker to NULL when the worker thread is stopped. And while at it, since disk_free_zone_resources() is always called after a failed call to disk_update_zone_resources(), remove the unnecessary call to disk_free_zone_resources() in disk_update_zone_resources(). Fixes: 1365b6904fd0 ("block: allow submitting all zone writes from a single context") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260522115622.588535-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 42ef830054dc..6a221c180889 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -2001,8 +2001,10 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) void disk_free_zone_resources(struct gendisk *disk) { - if (disk->zone_wplugs_worker) + if (disk->zone_wplugs_worker) { kthread_stop(disk->zone_wplugs_worker); + disk->zone_wplugs_worker = NULL; + } WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); if (disk->zone_wplugs_wq) { @@ -2135,9 +2137,6 @@ static int disk_update_zone_resources(struct gendisk *disk, ret = queue_limits_commit_update(q, &lim); unfreeze: - if (ret) - disk_free_zone_resources(disk); - blk_mq_unfreeze_queue(q, memflags); return ret;