From c3e6c11147f6f05c15e9c2d74f5d234a6661013c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:26 +0800 Subject: [PATCH 1/6] loop: add helper lo_cmd_nr_bvec() Add lo_cmd_nr_bvec() and prepare for refactoring lo_rw_aio(). Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 13ce229d450c..c6c37c9df193 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -337,6 +337,19 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret) lo_rw_aio_do_completion(cmd); } +static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct req_iterator rq_iter; + struct bio_vec tmp; + int nr_bvec = 0; + + rq_for_each_bvec(tmp, rq, rq_iter) + nr_bvec++; + + return nr_bvec; +} + static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos, int rw) { @@ -348,12 +361,9 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, struct file *file = lo->lo_backing_file; struct bio_vec tmp; unsigned int offset; - int nr_bvec = 0; + int nr_bvec = lo_cmd_nr_bvec(cmd); int ret; - rq_for_each_bvec(tmp, rq, rq_iter) - nr_bvec++; - if (rq->bio != rq->biotail) { bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), From fd858d1ca9694c88703a8a936d5c7596c86ada74 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:27 +0800 Subject: [PATCH 2/6] loop: add helper lo_rw_aio_prep() Add helper lo_rw_aio_prep() to separate the preparation phase(setting up bio vectors and initializing the iocb structure) from the actual I/O execution in the loop block driver. Prepare for using NOWAIT to improve loop performance. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 63 ++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c6c37c9df193..c0d8d290cb78 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -350,21 +350,15 @@ static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd) return nr_bvec; } -static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - loff_t pos, int rw) +static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd, + unsigned nr_bvec, loff_t pos) { - struct iov_iter iter; - struct req_iterator rq_iter; - struct bio_vec *bvec; struct request *rq = blk_mq_rq_from_pdu(cmd); - struct bio *bio = rq->bio; - struct file *file = lo->lo_backing_file; - struct bio_vec tmp; - unsigned int offset; - int nr_bvec = lo_cmd_nr_bvec(cmd); - int ret; if (rq->bio != rq->biotail) { + struct req_iterator rq_iter; + struct bio_vec *bvec; + struct bio_vec tmp; bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO); @@ -382,8 +376,42 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, *bvec = tmp; bvec++; } - bvec = cmd->bvec; + } else { + cmd->bvec = NULL; + } + + cmd->iocb.ki_pos = pos; + cmd->iocb.ki_filp = lo->lo_backing_file; + cmd->iocb.ki_ioprio = req_get_ioprio(rq); + if (cmd->use_aio) { + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + } else { + cmd->iocb.ki_complete = NULL; + cmd->iocb.ki_flags = 0; + } + return 0; +} + +static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + loff_t pos, int rw) +{ + struct iov_iter iter; + struct bio_vec *bvec; + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct bio *bio = rq->bio; + struct file *file = lo->lo_backing_file; + unsigned int offset; + int nr_bvec = lo_cmd_nr_bvec(cmd); + int ret; + + ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); + if (unlikely(ret)) + return ret; + + if (cmd->bvec) { 
offset = 0; + bvec = cmd->bvec; } else { /* * Same here, this bio may be started from the middle of the @@ -398,17 +426,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = offset; - cmd->iocb.ki_pos = pos; - cmd->iocb.ki_filp = file; - cmd->iocb.ki_ioprio = req_get_ioprio(rq); - if (cmd->use_aio) { - cmd->iocb.ki_complete = lo_rw_aio_complete; - cmd->iocb.ki_flags = IOCB_DIRECT; - } else { - cmd->iocb.ki_complete = NULL; - cmd->iocb.ki_flags = 0; - } - if (rw == ITER_SOURCE) { kiocb_start_write(&cmd->iocb); ret = file->f_op->write_iter(&cmd->iocb, &iter); From c66e9708f92760147a1ea7f66c7b60ec801f85e3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:28 +0800 Subject: [PATCH 3/6] loop: add lo_submit_rw_aio() Refactor lo_rw_aio() by extracting the I/O submission logic into a new helper function lo_submit_rw_aio(). This further improves code organization by separating the I/O preparation, submission, and completion handling into distinct phases. Prepare for using NOWAIT to improve loop performance. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c0d8d290cb78..a494a93ed2b1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -393,38 +393,32 @@ static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd, return 0; } -static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - loff_t pos, int rw) +static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + int nr_bvec, int rw) { - struct iov_iter iter; - struct bio_vec *bvec; struct request *rq = blk_mq_rq_from_pdu(cmd); - struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; - unsigned int offset; - int nr_bvec = lo_cmd_nr_bvec(cmd); + struct iov_iter iter; int ret; - ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); - if (unlikely(ret)) - return ret; - if (cmd->bvec) { - offset = 0; - bvec = cmd->bvec; + iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq)); + iter.iov_offset = 0; } else { + struct bio *bio = rq->bio; + struct bio_vec *bvec = __bvec_iter_bvec(bio->bi_io_vec, + bio->bi_iter); + /* * Same here, this bio may be started from the middle of the * 'bvec' because of bio splitting, so offset from the bvec * must be passed to iov iterator */ - offset = bio->bi_iter.bi_bvec_done; - bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); + iter.iov_offset = bio->bi_iter.bi_bvec_done; } atomic_set(&cmd->ref, 2); - iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); - iter.iov_offset = offset; if (rw == ITER_SOURCE) { kiocb_start_write(&cmd->iocb); @@ -433,7 +427,20 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); + return ret; +} +static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, 
+ loff_t pos, int rw) +{ + int nr_bvec = lo_cmd_nr_bvec(cmd); + int ret; + + ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); + if (unlikely(ret)) + return ret; + + ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); return -EIOCBQUEUED; From f4788ae9d7bc01735cb6ada333b038c2e3fff260 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:29 +0800 Subject: [PATCH 4/6] loop: move command blkcg/memcg initialization into loop_queue_work Move loop command blkcg/memcg initialization into loop_queue_work, in preparation for handling loop io commands with IOCB_NOWAIT. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a494a93ed2b1..c45635e48164 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -831,11 +831,28 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { + struct request __maybe_unused *rq = blk_mq_rq_from_pdu(cmd); struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; + /* always use the first bio's css */ + cmd->blkcg_css = NULL; + cmd->memcg_css = NULL; +#ifdef CONFIG_BLK_CGROUP + if (rq->bio) { + cmd->blkcg_css = bio_blkcg_css(rq->bio); +#ifdef CONFIG_MEMCG + if (cmd->blkcg_css) { + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); + } +#endif + } +#endif + spin_lock_irq(&lo->lo_work_lock); if (queue_on_root_worker(cmd->blkcg_css)) @@ -1907,21 +1924,6 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, break; } - /* always use the first bio's css */ - cmd->blkcg_css = NULL; - cmd->memcg_css = NULL; -#ifdef CONFIG_BLK_CGROUP - if (rq->bio) { - 
cmd->blkcg_css = bio_blkcg_css(rq->bio); -#ifdef CONFIG_MEMCG - if (cmd->blkcg_css) { - cmd->memcg_css = - cgroup_get_e_css(cmd->blkcg_css->cgroup, - &memory_cgrp_subsys); - } -#endif - } -#endif loop_queue_work(lo, cmd); return BLK_STS_OK; From 0ba93a906dda7ede9e7669adefe005ee18f3ff42 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:30 +0800 Subject: [PATCH 5/6] loop: try to handle loop aio command via NOWAIT IO first Try to handle loop aio command via NOWAIT IO first, so that we can avoid queueing the aio command into the workqueue. This is usually a big win when the FS block mapping is stable: Mikulas verified [1] that this improves IO perf by close to 5X in a 12-job sequential read/write test, in which the FS block mapping is stable. Fall back to the workqueue in case of -EAGAIN. This may add a small cost from the first retry, but when running the following write test over loop/sparse_file, the actual effect on randwrite is obvious: ``` truncate -s 4G 1.img #1.img is created on XFS/virtio-scsi losetup -f 1.img --direct-io=on fio --direct=1 --bs=4k --runtime=40 --time_based --numjobs=1 --ioengine=libaio \ --iodepth=16 --group_reporting=1 --filename=/dev/loop0 -name=job --rw=$RW ``` - RW=randwrite: obvious IOPS drop observed - RW=write: a small drop (5% - 10%) This perf drop on randwrite over a sparse file will be addressed in the following patch. BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or .write_iter(), which might sleep even though the IO is NOWAIT; the only effect is that the rcu read lock is replaced with the srcu read lock. 
Link: https://lore.kernel.org/linux-block/a8e5c76a-231f-07d1-a394-847de930f638@redhat.com/ [1] Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 68 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c45635e48164..94478c02fea6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -90,6 +90,8 @@ struct loop_cmd { #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH 128 +static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd); + static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); @@ -321,6 +323,15 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) if (!atomic_dec_and_test(&cmd->ref)) return; + + /* -EAGAIN could be returned from bdev's ->ki_complete */ + if (cmd->ret == -EAGAIN) { + struct loop_device *lo = rq->q->queuedata; + + loop_queue_work(lo, cmd); + return; + } + kfree(cmd->bvec); cmd->bvec = NULL; if (req_op(rq) == REQ_OP_WRITE) @@ -430,22 +441,51 @@ static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, return ret; } +static bool lo_backfile_support_nowait(const struct loop_device *lo) +{ + return lo->lo_backing_file->f_mode & FMODE_NOWAIT; +} + static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos, int rw) { int nr_bvec = lo_cmd_nr_bvec(cmd); int ret; - ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); - if (unlikely(ret)) - return ret; + /* prepared already if we have tried nowait */ + if (!cmd->use_aio || !lo_backfile_support_nowait(lo)) { + ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); + if (unlikely(ret)) + goto fail; + } + cmd->iocb.ki_flags &= ~IOCB_NOWAIT; ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); +fail: if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); return -EIOCBQUEUED; } +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, 
+ int rw) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; + int nr_bvec = lo_cmd_nr_bvec(cmd); + int ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); + + if (unlikely(ret)) + goto fail; + + cmd->iocb.ki_flags |= IOCB_NOWAIT; + ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); +fail: + if (ret != -EIOCBQUEUED && ret != -EAGAIN) + lo_rw_aio_complete(&cmd->iocb, ret); + return ret; +} + static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); @@ -1907,6 +1947,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct loop_device *lo = rq->q->queuedata; + int rw = 0; blk_mq_start_request(rq); @@ -1919,9 +1960,25 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, case REQ_OP_WRITE_ZEROES: cmd->use_aio = false; break; - default: + case REQ_OP_READ: + rw = ITER_DEST; cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; + case REQ_OP_WRITE: + rw = ITER_SOURCE; + cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; + break; + default: + return BLK_STS_IOERR; + } + + /* try NOWAIT if the backing file supports the mode */ + if (cmd->use_aio && lo_backfile_support_nowait(lo)) { + int res = lo_rw_aio_nowait(lo, cmd, rw); + + if (res != -EAGAIN && res != -EOPNOTSUPP) + return BLK_STS_OK; + /* fallback to workqueue for handling aio */ } loop_queue_work(lo, cmd); @@ -2073,7 +2130,8 @@ static int loop_add(int i) lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT | + BLK_MQ_F_BLOCKING; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); From 837ed303964673cf0c7e6a4624cd68d8cf254827 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: 
Wed, 15 Oct 2025 19:07:31 +0800 Subject: [PATCH 6/6] loop: add hint for handling aio via IOCB_NOWAIT Add a hint for deciding whether to use IOCB_NOWAIT to handle a loop aio command, to avoid causing a write (especially randwrite) perf regression on a sparse backing file. Try IOCB_NOWAIT in the following situations: - the backing file is a block device OR - the command is a READ aio command OR - there aren't any queued blocking async WRITEs, because then NOWAIT won't contend with a blocking WRITE, which often implies an exclusive lock With this simple policy, the perf regression of randwrite/write on a sparse backing file is fixed. Link: https://lore.kernel.org/dm-devel/7d6ae2c9-df8e-50d0-7ad6-b787cb3cfab4@redhat.com/ Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 94478c02fea6..9b842d767381 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -68,6 +68,7 @@ struct loop_device { struct rb_root worker_tree; struct timer_list timer; bool sysfs_inited; + unsigned lo_nr_blocking_writes; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; @@ -467,6 +468,33 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, return -EIOCBQUEUED; } +static inline bool lo_aio_try_nowait(struct loop_device *lo, + struct loop_cmd *cmd) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct request *rq = blk_mq_rq_from_pdu(cmd); + + /* NOWAIT works fine for backing block device */ + if (S_ISBLK(inode->i_mode)) + return true; + + /* + * NOWAIT is supposed to be fine for READ without contending with + * blocking WRITE + */ + if (req_op(rq) == REQ_OP_READ) + return true; + + /* + * If there is any queued non-NOWAIT async WRITE , don't try new + * NOWAIT WRITE for avoiding contention + * + * Here we focus on handling stable FS block mapping via NOWAIT + */ + return 
READ_ONCE(lo->lo_nr_blocking_writes) == 0; +} + static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, int rw) { @@ -478,6 +506,9 @@ static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, if (unlikely(ret)) goto fail; + if (!lo_aio_try_nowait(lo, cmd)) + return -EAGAIN; + cmd->iocb.ki_flags |= IOCB_NOWAIT; ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); fail: @@ -780,12 +811,19 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } +static ssize_t loop_attr_nr_blocking_writes_show(struct loop_device *lo, + char *buf) +{ + return sysfs_emit(buf, "%u\n", lo->lo_nr_blocking_writes); +} + LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); LOOP_ATTR_RO(dio); +LOOP_ATTR_RO(nr_blocking_writes); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, @@ -794,6 +832,7 @@ static struct attribute *loop_attrs[] = { &loop_attr_autoclear.attr, &loop_attr_partscan.attr, &loop_attr_dio.attr, + &loop_attr_nr_blocking_writes.attr, NULL, }; @@ -869,6 +908,24 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) } #endif +static inline void loop_inc_blocking_writes(struct loop_device *lo, + struct loop_cmd *cmd) +{ + lockdep_assert_held(&lo->lo_work_lock); + + if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE) + lo->lo_nr_blocking_writes += 1; +} + +static inline void loop_dec_blocking_writes(struct loop_device *lo, + struct loop_cmd *cmd) +{ + lockdep_assert_held(&lo->lo_work_lock); + + if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE) + lo->lo_nr_blocking_writes -= 1; +} + static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { struct request __maybe_unused *rq = blk_mq_rq_from_pdu(cmd); @@ -951,6 +1008,8 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) work = &lo->rootcg_work; cmd_list = &lo->rootcg_cmd_list; } + if 
(cmd->use_aio) + loop_inc_blocking_writes(lo, cmd); list_add_tail(&cmd->list_entry, cmd_list); queue_work(lo->workqueue, work); spin_unlock_irq(&lo->lo_work_lock); @@ -2052,6 +2111,8 @@ static void loop_process_work(struct loop_worker *worker, cond_resched(); spin_lock_irq(&lo->lo_work_lock); + if (cmd->use_aio) + loop_dec_blocking_writes(lo, cmd); } /*