From 47c1260ea822d44117cb091ded290719c1ea05e1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 3 Mar 2021 10:37:27 +0100 Subject: [PATCH] ANDROID: Revert "Merge 582cd91f69de ("Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block") into android-mainline" This reverts commit f858d3a02f88 ("Merge 582cd91f69de ("Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block") into android-mainline") as it is causing boot problems with md devices in the android build systems. Reverting this should allow us to continue on with the merges of other non-block issues, in order to be able to focus on tracking this issue down at a later time. Fixes: f858d3a02f88 ("Merge 582cd91f69de ("Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block") into android-mainline") Bug: 181742070 Cc: Eric Biggers Signed-off-by: Greg Kroah-Hartman Change-Id: I03cc2f86827f2d18db88fc68a4128747430f40d4 --- Documentation/block/biovecs.rst | 2 - Documentation/block/queue-sysfs.rst | 13 - Documentation/filesystems/f2fs.rst | 1 + Documentation/filesystems/porting.rst | 16 - block/bfq-iosched.c | 445 ++++++++------------- block/bfq-iosched.h | 29 +- block/bfq-wf2q.c | 3 + block/bio-integrity.c | 17 +- block/bio.c | 554 ++++++++++++++------------ block/blk-cgroup.c | 15 +- block/blk-core.c | 43 +- block/blk-crypto-fallback.c | 4 +- block/blk-exec.c | 14 +- block/blk-flush.c | 17 +- block/blk-mq.c | 67 +--- block/blk-settings.c | 41 +- block/blk-sysfs.c | 8 - block/blk-wbt.c | 4 +- block/blk-zoned.c | 17 - block/blk.h | 10 +- block/bounce.c | 2 - block/bsg.c | 6 +- block/genhd.c | 275 +++++++++++-- block/kyber-iosched.c | 1 - block/mq-deadline.c | 6 + block/partitions/core.c | 33 +- block/scsi_ioctl.c | 6 +- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/block/drbd/drbd_int.h | 2 + drivers/block/drbd/drbd_main.c | 13 + drivers/block/drbd/drbd_req.c | 5 +- drivers/block/drbd/drbd_req.h | 12 + 
drivers/block/drbd/drbd_worker.c | 5 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/null_blk/zoned.c | 8 +- drivers/block/paride/pd.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/sx8.c | 4 +- drivers/block/virtio_blk.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/ide/ide-atapi.c | 2 +- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-cd_ioctl.c | 2 +- drivers/ide/ide-devsets.c | 2 +- drivers/ide/ide-disk.c | 2 +- drivers/ide/ide-ioctls.c | 4 +- drivers/ide/ide-park.c | 2 +- drivers/ide/ide-pm.c | 4 +- drivers/ide/ide-tape.c | 2 +- drivers/ide/ide-taskfile.c | 2 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/request.c | 34 +- drivers/md/bcache/super.c | 2 +- drivers/md/dm-clone-target.c | 14 +- drivers/md/dm-zoned-metadata.c | 6 +- drivers/md/md.c | 71 ++-- drivers/md/md.h | 2 + drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 6 +- drivers/md/raid5-ppl.c | 2 +- drivers/md/raid5.c | 108 ++--- drivers/mmc/core/block.c | 10 +- drivers/nvme/host/core.c | 22 +- drivers/nvme/host/lightnvm.c | 6 +- drivers/nvme/host/multipath.c | 4 +- drivers/nvme/host/pci.c | 4 +- drivers/nvme/host/zns.c | 11 +- drivers/nvme/target/io-cmd-bdev.c | 2 +- drivers/nvme/target/passthru.c | 2 +- drivers/s390/block/dasd.c | 26 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- drivers/scsi/sd_zbc.c | 43 +- drivers/scsi/sg.c | 3 +- drivers/scsi/st.c | 2 +- drivers/target/target_core_file.c | 20 +- drivers/target/target_core_pscsi.c | 3 +- fs/block_dev.c | 20 +- fs/btrfs/volumes.c | 2 +- fs/direct-io.c | 2 - fs/exfat/file.c | 2 +- fs/ext4/fast_commit.c | 4 +- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/data.c | 28 +- fs/f2fs/f2fs.h | 2 + fs/f2fs/segment.c | 12 +- fs/f2fs/super.c | 1 + fs/fat/file.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hfsplus/super.c | 2 +- fs/iomap/direct-io.c | 9 +- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 4 +- fs/jbd2/recovery.c | 2 +- fs/libfs.c | 2 +- fs/nfs/blocklayout/blocklayout.c | 5 + 
fs/nfsd/blocklayout.c | 2 +- fs/nilfs2/segbuf.c | 4 + fs/nilfs2/the_nilfs.h | 2 +- fs/ocfs2/file.c | 2 +- fs/reiserfs/file.c | 2 +- fs/splice.c | 9 +- fs/xfs/xfs_super.c | 2 +- fs/zonefs/super.c | 13 +- include/linux/bio.h | 37 +- include/linux/blk-mq.h | 4 + include/linux/blk_types.h | 30 +- include/linux/blkdev.h | 54 +-- include/linux/elevator.h | 2 - include/linux/genhd.h | 21 +- include/linux/swap.h | 1 + lib/iov_iter.c | 21 +- mm/page_io.c | 45 ++- mm/swapfile.c | 36 +- 117 files changed, 1370 insertions(+), 1173 deletions(-) diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index ddb867e0185b..36771a131b56 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -40,8 +40,6 @@ normal code doesn't have to deal with bi_bvec_done. There is a lower level advance function - bvec_iter_advance() - which takes a pointer to a biovec, not a bio; this is used by the bio integrity code. -As of 5.12 bvec segments with zero bv_len are not supported. - What's all this get us? ======================= diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 4dc7f0d499a8..2638d3446b79 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -261,12 +261,6 @@ For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES is not supported. -zone_append_max_bytes (RO) --------------------------- -This is the maximum number of bytes that can be written to a sequential -zone of a zoned block device using a zone append write operation -(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices. 
- zoned (RO) ---------- This indicates if the device is a zoned block device and the zone model of the @@ -279,11 +273,4 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC do not support zone commands, they will be treated as regular block devices and zoned will report "none". -zone_write_granularity (RO) ---------------------------- -This indicates the alignment constraint, in bytes, for write operations in -sequential zones of zoned block devices (devices with a zoned attributed -that reports "host-managed" or "host-aware"). This value is always 0 for -regular block devices. - Jens Axboe , February 2009 diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 35ed01a5fbc9..81c05baa8312 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -179,6 +179,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 1f8cf8e10b34..867036aa90b8 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -865,19 +865,3 @@ no matter what. Everything is handled by the caller. clone_private_mount() returns a longterm mount now, so the proper destructor of its result is kern_unmount() or kern_unmount_array(). - ---- - -**mandatory** - -zero-length bvec segments are disallowed, they must be filtered out before -passed on to an iterator. - ---- - -**mandatory** - -For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but -uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and -page references stay until I/O has completed, i.e. until ->ki_complete() has -been called or returned with non -EIOCBQUEUED code. 
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b398dde53af9..9e81d1052091 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -158,6 +158,7 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); +BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS \ /* Expiration time of sync (0) and async (1) requests, in ns. */ @@ -1023,16 +1024,9 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, else bfq_clear_bfqq_IO_bound(bfqq); - bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; - bfqq->inject_limit = bic->saved_inject_limit; - bfqq->decrease_time_jif = bic->saved_decrease_time_jif; - bfqq->entity.new_weight = bic->saved_weight; bfqq->ttime = bic->saved_ttime; - bfqq->io_start_time = bic->saved_io_start_time; - bfqq->tot_idle_time = bic->saved_tot_idle_time; bfqq->wr_coeff = bic->saved_wr_coeff; - bfqq->service_from_wr = bic->saved_service_from_wr; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; @@ -1653,8 +1647,6 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } -static bool bfq_better_to_idle(struct bfq_queue *bfqq); - static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, @@ -1679,19 +1671,15 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * - it is sync, * - it does not belong to a large burst, * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense), - * - has a default weight (otherwise we assume the user wanted - * to control its weight explicitly) + * - is linked to a bfq_io_cq (it is not shared in any sense). 
*/ in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && !BFQQ_TOTALLY_SEEKY(bfqq) && !in_burst && time_is_before_jiffies(bfqq->soft_rt_next_start) && - bfqq->dispatched == 0 && - bfqq->entity.new_weight == 40; - *interactive = !in_burst && idle_for_long_time && - bfqq->entity.new_weight == 40; + bfqq->dispatched == 0; + *interactive = !in_burst && idle_for_long_time; wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || (bfq_bfqq_sync(bfqq) && @@ -1729,6 +1717,17 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_clear_bfqq_just_created(bfqq); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + if (bfqd->low_latency) { if (unlikely(time_is_after_jiffies(bfqq->split_time))) /* wraparound */ @@ -1756,10 +1755,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_add_bfqq_busy(bfqd, bfqq); /* - * Expire in-service queue if preemption may be needed for - * guarantees or throughput. As for guarantees, we care - * explicitly about two cases. The first is that bfqq has to - * recover a service hole, as explained in the comments on + * Expire in-service queue only if preemption may be needed + * for guarantees. In particular, we care only about two + * cases. The first is that bfqq has to recover a service + * hole, as explained in the comments on * bfq_bfqq_update_budg_for_activation(), i.e., that * bfqq_wants_to_preempt is true. However, if bfqq does not * carry time-critical I/O, then bfqq's bandwidth is less @@ -1786,23 +1785,11 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * timestamps of the in-service queue would need to be * updated, and this operation is quite costly (see the * comments on bfq_bfqq_update_budg_for_activation()). 
- * - * As for throughput, we ask bfq_better_to_idle() whether we - * still need to plug I/O dispatching. If bfq_better_to_idle() - * says no, then plugging is not needed any longer, either to - * boost throughput or to perserve service guarantees. Then - * the best option is to stop plugging I/O, as not doing so - * would certainly lower throughput. We may end up in this - * case if: (1) upon a dispatch attempt, we detected that it - * was better to plug I/O dispatch, and to wait for a new - * request to arrive for the currently in-service queue, but - * (2) this switch of bfqq to busy changes the scenario. */ if (bfqd->in_service_queue && ((bfqq_wants_to_preempt && bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || - bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) || - !bfq_better_to_idle(bfqd->in_service_queue)) && + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); @@ -1874,138 +1861,6 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd, bfqq->decrease_time_jif = jiffies; } -static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) -{ - u64 tot_io_time = now_ns - bfqq->io_start_time; - - if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0) - bfqq->tot_idle_time += - now_ns - bfqq->ttime.last_end_request; - - if (unlikely(bfq_bfqq_just_created(bfqq))) - return; - - /* - * Must be busy for at least about 80% of the time to be - * considered I/O bound. - */ - if (bfqq->tot_idle_time * 5 > tot_io_time) - bfq_clear_bfqq_IO_bound(bfqq); - else - bfq_mark_bfqq_IO_bound(bfqq); - - /* - * Keep an observation window of at most 200 ms in the past - * from now. 
- */ - if (tot_io_time > 200 * NSEC_PER_MSEC) { - bfqq->io_start_time = now_ns - (tot_io_time>>1); - bfqq->tot_idle_time >>= 1; - } -} - -/* - * Detect whether bfqq's I/O seems synchronized with that of some - * other queue, i.e., whether bfqq, after remaining empty, happens to - * receive new I/O only right after some I/O request of the other - * queue has been completed. We call waker queue the other queue, and - * we assume, for simplicity, that bfqq may have at most one waker - * queue. - * - * A remarkable throughput boost can be reached by unconditionally - * injecting the I/O of the waker queue, every time a new - * bfq_dispatch_request happens to be invoked while I/O is being - * plugged for bfqq. In addition to boosting throughput, this - * unblocks bfqq's I/O, thereby improving bandwidth and latency for - * bfqq. Note that these same results may be achieved with the general - * injection mechanism, but less effectively. For details on this - * aspect, see the comments on the choice of the queue for injection - * in bfq_select_queue(). - * - * Turning back to the detection of a waker queue, a queue Q is deemed - * as a waker queue for bfqq if, for three consecutive times, bfqq - * happens to become non empty right after a request of Q has been - * completed. In particular, on the first time, Q is tentatively set - * as a candidate waker queue, while on the third consecutive time - * that Q is detected, the field waker_bfqq is set to Q, to confirm - * that Q is a waker queue for bfqq. These detection steps are - * performed only if bfqq has a long think time, so as to make it more - * likely that bfqq's I/O is actually being blocked by a - * synchronization. This last filter, plus the above three-times - * requirement, make false positives less likely. - * - * NOTE - * - * The sooner a waker queue is detected, the sooner throughput can be - * boosted by injecting I/O from the waker queue. 
Fortunately, - * detection is likely to be actually fast, for the following - * reasons. While blocked by synchronization, bfqq has a long think - * time. This implies that bfqq's inject limit is at least equal to 1 - * (see the comments in bfq_update_inject_limit()). So, thanks to - * injection, the waker queue is likely to be served during the very - * first I/O-plugging time interval for bfqq. This triggers the first - * step of the detection mechanism. Thanks again to injection, the - * candidate waker queue is then likely to be confirmed no later than - * during the next I/O-plugging interval for bfqq. - * - * ISSUE - * - * On queue merging all waker information is lost. - */ -static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, - u64 now_ns) -{ - if (!bfqd->last_completed_rq_bfqq || - bfqd->last_completed_rq_bfqq == bfqq || - bfq_bfqq_has_short_ttime(bfqq) || - now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || - bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) - return; - - if (bfqd->last_completed_rq_bfqq != - bfqq->tentative_waker_bfqq) { - /* - * First synchronization detected with a - * candidate waker queue, or with a different - * candidate waker queue from the current one. - */ - bfqq->tentative_waker_bfqq = - bfqd->last_completed_rq_bfqq; - bfqq->num_waker_detections = 1; - } else /* Same tentative waker queue detected again */ - bfqq->num_waker_detections++; - - if (bfqq->num_waker_detections == 3) { - bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; - bfqq->tentative_waker_bfqq = NULL; - - /* - * If the waker queue disappears, then - * bfqq->waker_bfqq must be reset. To - * this goal, we maintain in each - * waker queue a list, woken_list, of - * all the queues that reference the - * waker queue through their - * waker_bfqq pointer. When the waker - * queue exits, the waker_bfqq pointer - * of all the queues in the woken_list - * is reset. 
- * - * In addition, if bfqq is already in - * the woken_list of a waker queue, - * then, before being inserted into - * the woken_list of a new waker - * queue, bfqq must be removed from - * the woken_list of the old waker - * queue. - */ - if (!hlist_unhashed(&bfqq->woken_list_node)) - hlist_del_init(&bfqq->woken_list_node); - hlist_add_head(&bfqq->woken_list_node, - &bfqd->last_completed_rq_bfqq->woken_list); - } -} - static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -2013,14 +1868,117 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - u64 now_ns = ktime_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { - bfq_check_waker(bfqd, bfqq, now_ns); + /* + * Detect whether bfqq's I/O seems synchronized with + * that of some other queue, i.e., whether bfqq, after + * remaining empty, happens to receive new I/O only + * right after some I/O request of the other queue has + * been completed. We call waker queue the other + * queue, and we assume, for simplicity, that bfqq may + * have at most one waker queue. + * + * A remarkable throughput boost can be reached by + * unconditionally injecting the I/O of the waker + * queue, every time a new bfq_dispatch_request + * happens to be invoked while I/O is being plugged + * for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth + * and latency for bfqq. Note that these same results + * may be achieved with the general injection + * mechanism, but less effectively. For details on + * this aspect, see the comments on the choice of the + * queue for injection in bfq_select_queue(). 
+ * + * Turning back to the detection of a waker queue, a + * queue Q is deemed as a waker queue for bfqq if, for + * two consecutive times, bfqq happens to become non + * empty right after a request of Q has been + * completed. In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while + * on the second time, the flag + * bfq_bfqq_has_waker(bfqq) is set to confirm that Q + * is a waker queue for bfqq. These detection steps + * are performed only if bfqq has a long think time, + * so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This + * last filter, plus the above two-times requirement, + * make false positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner + * throughput can be boosted by injecting I/O from the + * waker queue. Fortunately, detection is likely to be + * actually fast, for the following reasons. While + * blocked by synchronization, bfqq has a long think + * time. This implies that bfqq's inject limit is at + * least equal to 1 (see the comments in + * bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served + * during the very first I/O-plugging time interval + * for bfqq. This triggers the first step of the + * detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be + * confirmed no later than during the next + * I/O-plugging interval for bfqq. + */ + if (bfqd->last_completed_rq_bfqq && + !bfq_bfqq_has_short_ttime(bfqq) && + ktime_get_ns() - bfqd->last_completion < + 200 * NSEC_PER_USEC) { + if (bfqd->last_completed_rq_bfqq != bfqq && + bfqd->last_completed_rq_bfqq != + bfqq->waker_bfqq) { + /* + * First synchronization detected with + * a candidate waker queue, or with a + * different candidate waker queue + * from the current one. 
+ */ + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. + */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + + bfq_clear_bfqq_has_waker(bfqq); + } else if (bfqd->last_completed_rq_bfqq == + bfqq->waker_bfqq && + !bfq_bfqq_has_waker(bfqq)) { + /* + * synchronization with waker_bfqq + * seen for the second time + */ + bfq_mark_bfqq_has_waker(bfqq); + } + } /* * Periodically reset inject limit, to make sure that @@ -2089,9 +2047,6 @@ static void bfq_add_request(struct request *rq) } } - if (bfq_bfqq_sync(bfqq)) - bfq_update_io_intensity(bfqq, now_ns); - elv_rb_add(&bfqq->sort_list, rq); /* @@ -2397,24 +2352,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { - /* - * If bfqq has been enjoying interactive weight-raising, then - * reset soft_rt_next_start. We do it for the following - * reason. bfqq may have been conveying the I/O needed to load - * a soft real-time application. Such an application actually - * exhibits a soft real-time I/O pattern after it finishes - * loading, and finally starts doing its job. 
But, if bfqq has - * been receiving a lot of bandwidth so far (likely to happen - * on a fast device), then soft_rt_next_start now contains a - * high value that. So, without this reset, bfqq would be - * prevented from being possibly considered as soft_rt for a - * very long time. - */ - - if (bfqq->wr_cur_max_time != - bfqq->bfqd->bfq_wr_rt_max_time) - bfqq->soft_rt_next_start = jiffies; - if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; @@ -2749,16 +2686,10 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; - bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; - bic->saved_inject_limit = bfqq->inject_limit; - bic->saved_decrease_time_jif = bfqq->decrease_time_jif; - bic->saved_weight = bfqq->entity.orig_weight; bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_io_start_time = bfqq->io_start_time; - bic->saved_tot_idle_time = bfqq->tot_idle_time; bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && @@ -2781,7 +2712,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_wr_coeff = bfqq->wr_coeff; bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_service_from_wr = bfqq->service_from_wr; bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } @@ -3007,7 +2937,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, } bfqd->in_service_queue = bfqq; - bfqd->in_serv_last_pos = 0; } /* @@ -3513,38 +3442,20 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) * order until all the requests already queued in the device have been * served. The last sub-condition commented above somewhat mitigates * this problem for weight-raised queues. 
- * - * However, as an additional mitigation for this problem, we preserve - * plugging for a special symmetric case that may suddenly turn into - * asymmetric: the case where only bfqq is busy. In this case, not - * expiring bfqq does not cause any harm to any other queues in terms - * of service guarantees. In contrast, it avoids the following unlucky - * sequence of events: (1) bfqq is expired, (2) a new queue with a - * lower weight than bfqq becomes busy (or more queues), (3) the new - * queue is served until a new request arrives for bfqq, (4) when bfqq - * is finally served, there are so many requests of the new queue in - * the drive that the pending requests for bfqq take a lot of time to - * be served. In particular, event (2) may case even already - * dispatched requests of bfqq to be delayed, inside the drive. So, to - * avoid this series of events, the scenario is preventively declared - * as asymmetric also if bfqq is the only busy queues */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - int tot_busy_queues = bfq_tot_busy_queues(bfqd); - /* No point in idling for bfqq if it won't get requests any longer */ if (unlikely(!bfqq_process_refs(bfqq))) return false; return (bfqq->wr_coeff > 1 && (bfqd->wr_busy_queues < - tot_busy_queues || + bfq_tot_busy_queues(bfqd) || bfqd->rq_in_driver >= bfqq->dispatched + 4)) || - bfq_asymmetric_scenario(bfqd, bfqq) || - tot_busy_queues == 1; + bfq_asymmetric_scenario(bfqd, bfqq); } static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -4028,6 +3939,10 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) bfq_bfqq_charge_time(bfqd, bfqq, delta); + if (reason == BFQQE_TOO_IDLE && + entity->service <= 2 * entity->budget / 10) + bfq_clear_bfqq_IO_bound(bfqq); + if (bfqd->low_latency && bfqq->wr_coeff == 1) bfqq->last_wr_start_finish = jiffies; @@ -4037,15 +3952,30 @@ void bfq_bfqq_expire(struct 
bfq_data *bfqd, * If we get here, and there are no outstanding * requests, then the request pattern is isochronous * (see the comments on the function - * bfq_bfqq_softrt_next_start()). Therefore we can - * compute soft_rt_next_start. + * bfq_bfqq_softrt_next_start()). Thus we can compute + * soft_rt_next_start. And we do it, unless bfqq is in + * interactive weight raising. We do not do it in the + * latter subcase, for the following reason. bfqq may + * be conveying the I/O needed to load a soft + * real-time application. Such an application will + * actually exhibit a soft real-time I/O pattern after + * it finally starts doing its job. But, if + * soft_rt_next_start is computed here for an + * interactive bfqq, and bfqq had received a lot of + * service before remaining with no outstanding + * request (likely to happen on a fast device), then + * soft_rt_next_start would be assigned such a high + * value that, for a very long time, bfqq would be + * prevented from being possibly considered as soft + * real time. * * If, instead, the queue still has outstanding * requests, then we have to wait for the completion * of all the outstanding requests to discover whether * the request pattern is actually isochronous. 
*/ - if (bfqq->dispatched == 0) + if (bfqq->dispatched == 0 && + bfqq->wr_coeff != bfqd->bfq_wr_coeff) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); else if (bfqq->dispatched > 0) { @@ -4567,9 +4497,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) bfqq = bfqq->bic->bfqq[0]; - else if (bfqq->waker_bfqq && + else if (bfq_bfqq_has_waker(bfqq) && bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->waker_bfqq->next_rq && + bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, bfqq->waker_bfqq) <= bfq_bfqq_budget_left(bfqq->waker_bfqq) @@ -4629,21 +4559,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->wr_cur_max_time)) { if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) { - /* - * Either in interactive weight - * raising, or in soft_rt weight - * raising with the - * interactive-weight-raising period - * elapsed (so no switch back to - * interactive weight raising). 
- */ + bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); - } else { /* - * soft_rt finishing while still in - * interactive period, switch back to - * interactive weight raising - */ + else { switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } @@ -4722,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + if (!atomic_read(&hctx->elevator_queued)) + return false; + /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -4971,6 +4892,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_for_each_entry_safe(item, n, &bfqq->woken_list, woken_list_node) { item->waker_bfqq = NULL; + bfq_clear_bfqq_has_waker(item); hlist_del_init(&item->woken_list_node); } @@ -5090,8 +5012,6 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d", - bfqq->new_ioprio, bfqq->entity.new_weight); bfqq->entity.prio_changed = 1; } @@ -5129,8 +5049,6 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync) { - u64 now_ns = ktime_get_ns(); - RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5158,9 +5076,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq); /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = now_ns + 1; - - bfqq->io_start_time = now_ns; + bfqq->ttime.last_end_request = ktime_get_ns() + 1; bfq_mark_bfqq_IO_bound(bfqq); @@ -5278,19 +5194,11 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed; + u64 elapsed = ktime_get_ns() - 
bfqq->ttime.last_end_request; - /* - * We are really interested in how long it takes for the queue to - * become busy when there is no outstanding IO for this queue. So - * ignore cases when the bfq queue has already IO queued. - */ - if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) - return; - elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); - ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; + ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ttime->ttime_samples); @@ -5305,26 +5213,8 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - BFQQ_TOTALLY_SEEKY(bfqq)) { - if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) { - /* - * In soft_rt weight raising with the - * interactive-weight-raising period - * elapsed (so no switch back to - * interactive weight raising). - */ - bfq_bfqq_end_wr(bfqq); - } else { /* - * stopping soft_rt weight raising - * while still in interactive period, - * switch back to interactive weight - * raising - */ - switch_back_to_interactive_wr(bfqq, bfqd); - bfqq->entity.prio_changed = 1; - } - } + BFQQ_TOTALLY_SEEKY(bfqq)) + bfq_bfqq_end_wr(bfqq); } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, @@ -5348,13 +5238,12 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, return; /* Think time is infinite if no process is linked to - * bfqq. Otherwise check average think time to decide whether - * to mark as has_short_ttime. To this goal, compare average - * think time with half the I/O-plugging timeout. + * bfqq. 
Otherwise check average think time to + * decide whether to mark as has_short_ttime */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || (bfq_sample_valid(bfqq->ttime.ttime_samples) && - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1)) + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) has_short_ttime = false; state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); @@ -5668,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bfq_insert_request(hctx, rq, at_head); + atomic_inc(&hctx->elevator_queued); } } @@ -6035,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); bfq_finish_requeue_request_body(bfqq); + atomic_dec(&rq->mq_hctx->elevator_queued); spin_unlock_irqrestore(&bfqd->lock, flags); } else { @@ -6598,6 +6489,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_timeout = bfq_timeout; + bfqd->bfq_requests_within_timer = 120; + bfqd->bfq_large_burst_thresh = 8; bfqd->bfq_burst_interval = msecs_to_jiffies(180); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index b8e793c34ff1..703895224562 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -291,11 +291,6 @@ struct bfq_queue { /* associated @bfq_ttime struct */ struct bfq_ttime ttime; - /* when bfqq started to do I/O within the last observation window */ - u64 io_start_time; - /* how long bfqq has remained empty during the last observ. window */ - u64 tot_idle_time; - /* bit vector: a 1 for each seeky requests in history */ u32 seek_history; @@ -376,11 +371,6 @@ struct bfq_queue { * bfq_select_queue(). */ struct bfq_queue *waker_bfqq; - /* pointer to the curr. 
tentative waker queue, see bfq_check_waker() */ - struct bfq_queue *tentative_waker_bfqq; - /* number of times the same tentative waker has been detected */ - unsigned int num_waker_detections; - /* node for woken_list, see below */ struct hlist_node woken_list_node; /* @@ -417,9 +407,6 @@ struct bfq_io_cq { */ bool saved_IO_bound; - u64 saved_io_start_time; - u64 saved_tot_idle_time; - /* * Same purpose as the previous fields for the value of the * field keeping the queue's belonging to a large burst @@ -445,15 +432,9 @@ struct bfq_io_cq { */ unsigned long saved_wr_coeff; unsigned long saved_last_wr_start_finish; - unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; - - /* Save also injection state */ - u64 saved_last_serv_time_ns; - unsigned int saved_inject_limit; - unsigned long saved_decrease_time_jif; }; /** @@ -660,6 +641,14 @@ struct bfq_data { */ unsigned int bfq_timeout; + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ + unsigned int bfq_requests_within_timer; + /* * Force device idling whenever needed to provide accurate * service guarantees, without caring about throughput @@ -781,6 +770,7 @@ enum bfqq_state_flags { */ BFQQF_coop, /* bfqq is shared */ BFQQF_split_coop, /* shared bfqq will be split */ + BFQQF_has_waker /* bfqq has a waker queue */ }; #define BFQ_BFQQ_FNS(name) \ @@ -800,6 +790,7 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); +BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS /* Expiration reasons. 
*/ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 070e34a7feb1..26776bdbdf36 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -137,6 +137,9 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, sd->next_in_service = next_in_service; + if (!next_in_service) + return parent_sched_may_change; + return parent_sched_may_change; } diff --git a/block/bio-integrity.c b/block/bio-integrity.c index dfa652122a2d..c3e5abcfdc98 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -14,6 +14,8 @@ #include #include "blk.h" +#define BIP_INLINE_VECS 4 + static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; @@ -28,7 +30,7 @@ static void __bio_integrity_free(struct bio_set *bs, if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_max_vcnt); + bip->bip_slab); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); @@ -61,7 +63,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIO_INLINE_VECS; + inline_vecs = BIP_INLINE_VECS; } if (unlikely(!bip)) @@ -70,11 +72,14 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { - bip->bip_max_vcnt = nr_vecs; - bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, - &bip->bip_max_vcnt, gfp_mask); + unsigned long idx = 0; + + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + &bs->bvec_integrity_pool); if (!bip->bip_vec) goto err; + bip->bip_max_vcnt = bvec_nr_vecs(idx); + bip->bip_slab = idx; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; @@ -465,6 +470,6 @@ void __init bio_integrity_init(void) bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIO_INLINE_VECS, + sizeof(struct bio_vec) * 
BIP_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } diff --git a/block/bio.c b/block/bio.c index a1c4d2900c7a..4c99ef28fef8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,40 +19,27 @@ #include #include #include -#include #include #include "blk.h" #include "blk-rq-qos.h" -static struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -} bvec_slabs[] __read_mostly = { - { .nr_vecs = 16, .name = "biovec-16" }, - { .nr_vecs = 64, .name = "biovec-64" }, - { .nr_vecs = 128, .name = "biovec-128" }, - { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, -}; +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 -static struct biovec_slab *biovec_slab(unsigned short nr_vecs) -{ - switch (nr_vecs) { - /* smaller bios use inline vecs */ - case 5 ... 16: - return &bvec_slabs[0]; - case 17 ... 64: - return &bvec_slabs[1]; - case 65 ... 128: - return &bvec_slabs[2]; - case 129 ... BIO_MAX_PAGES: - return &bvec_slabs[3]; - default: - BUG(); - return NULL; - } -} +/* + * if you change this list, also change bvec_alloc or things will + * break badly! 
cannot be bigger than what you can fit into an + * unsigned short + */ +#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } +static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { + BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), +}; +#undef BV /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by @@ -71,133 +58,178 @@ struct bio_slab { char name[8]; }; static DEFINE_MUTEX(bio_slab_lock); -static DEFINE_XARRAY(bio_slabs); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; -static struct bio_slab *create_bio_slab(unsigned int size) +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) { - struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); - - if (!bslab) - return NULL; - - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); - bslab->slab = kmem_cache_create(bslab->name, size, - ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); - if (!bslab->slab) - goto fail_alloc_slab; - - bslab->slab_ref = 1; - bslab->slab_size = size; - - if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) - return bslab; - - kmem_cache_destroy(bslab->slab); - -fail_alloc_slab: - kfree(bslab); - return NULL; -} - -static inline unsigned int bs_bio_slab_size(struct bio_set *bs) -{ - return bs->front_pad + sizeof(struct bio) + bs->back_pad; -} - -static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) -{ - unsigned int size = bs_bio_slab_size(bs); - struct bio_slab *bslab; + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab, *new_bio_slabs; + unsigned int new_bio_slab_max; + unsigned int i, entry = -1; mutex_lock(&bio_slab_lock); - bslab = xa_load(&bio_slabs, size); - if (bslab) - bslab->slab_ref++; - else - bslab = create_bio_slab(size); - mutex_unlock(&bio_slab_lock); - if (bslab) - return bslab->slab; - return NULL; + i = 0; + while (i < bio_slab_nr) { + bslab = 
&bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + new_bio_slab_max = bio_slab_max << 1; + new_bio_slabs = krealloc(bio_slabs, + new_bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!new_bio_slabs) + goto out_unlock; + bio_slab_max = new_bio_slab_max; + bio_slabs = new_bio_slabs; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; - unsigned int slab_size = bs_bio_slab_size(bs); + unsigned int i; mutex_lock(&bio_slab_lock); - bslab = xa_load(&bio_slabs, slab_size); + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; - WARN_ON_ONCE(bslab->slab != bs->bio_slab); - WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; - xa_erase(&bio_slabs, slab_size); - kmem_cache_destroy(bslab->slab); - kfree(bslab); + bslab->slab = NULL; out: mutex_unlock(&bio_slab_lock); } -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) +unsigned int bvec_nr_vecs(unsigned short idx) { - BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); + return bvec_slabs[--idx].nr_vecs; +} - if (nr_vecs == BIO_MAX_PAGES) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) +{ + if (!idx) + return; + idx--; + + BIO_BUG_ON(idx >= BVEC_POOL_NR); + + if (idx == BVEC_POOL_MAX) { mempool_free(bv, pool); 
- else if (nr_vecs > BIO_INLINE_VECS) - kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); + } else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } } -/* - * Make the first allocation restricted and don't dump info on allocation - * failures, since we'll fall back to the mempool in case of failure. - */ -static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) { - return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; -} + struct bio_vec *bvl; -struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, - gfp_t gfp_mask) -{ - struct biovec_slab *bvs = biovec_slab(*nr_vecs); - - if (WARN_ON_ONCE(!bvs)) + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: return NULL; - - /* - * Upgrade the nr_vecs request to take full advantage of the allocation. - * We also rely on this in the bvec_free path. - */ - *nr_vecs = bvs->nr_vecs; - - /* - * Try a slab allocation first for all smaller allocations. If that - * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_PAGES entries. - */ - if (*nr_vecs < BIO_MAX_PAGES) { - struct bio_vec *bvl; - - bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) - return bvl; - *nr_vecs = BIO_MAX_PAGES; } - return mempool_alloc(pool, gfp_mask); + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. 
+ */ + if (*idx == BVEC_POOL_MAX) { +fallback: + bvl = mempool_alloc(pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); + + /* + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. + */ + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + /* + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM + * is set, retry with the 1-entry mempool + */ + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { + *idx = BVEC_POOL_MAX; + goto fallback; + } + } + + (*idx)++; + return bvl; } void bio_uninit(struct bio *bio) @@ -223,7 +255,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -267,8 +299,12 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio) { + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + bio_uninit(bio); + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -369,97 +405,122 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Allocate a bio from the mempools in @bs. + * Description: + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. * - * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to - * allocate a bio. This is due to the mempool guarantees. To make this work, - * callers must never allocate more than 1 bio at a time from the general pool. 
- * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under submit_bio_noacct() (i.e. any block driver), - * bios are not submitted until after you return - see the code in - * submit_bio_noacct() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under submit_bio_noacct() - * would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under + * submit_bio_noacct() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. 
Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * Returns: Pointer to new bio on success, NULL on failure. + * RETURNS: + * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; + unsigned front_pad; + unsigned inline_vecs; + struct bio_vec *bvl = NULL; struct bio *bio; void *p; - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) - return NULL; + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; - /* - * submit_bio_noacct() converts recursion to iteration; this means if - * we're running beneath it, any bios we allocate and submit will not be - * submitted (and thus freed) until after we return. - * - * This exposes us to a potential deadlock if we allocate multiple bios - * from the same bio_set() while running underneath submit_bio_noacct(). - * If we were to allocate multiple bios (say a stacking block driver - * that was splitting bios), we would deadlock if we exhausted the - * mempool's reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are bios on - * current->bio_list, we first try the allocation without - * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be - * blocking to the rescuer workqueue before we retry with the original - * gfp_flags. 
- */ - if (current->bio_list && - (!bio_list_empty(&current->bio_list[0]) || - !bio_list_empty(&current->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && + nr_iovecs > 0)) + return NULL; + /* + * submit_bio_noacct() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath submit_bio_noacct(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. 
+ */ + + if (current->bio_list && + (!bio_list_empty(&current->bio_list[0]) || + !bio_list_empty(&current->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(&bs->bio_pool, gfp_mask); + } + + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; } + if (unlikely(!p)) return NULL; - bio = p + bs->front_pad; - if (nr_iovecs > BIO_INLINE_VECS) { - struct bio_vec *bvl = NULL; + bio = p + front_pad; + bio_init(bio, NULL, 0); - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + if (nr_iovecs > inline_vecs) { + unsigned long idx = 0; + + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); } + if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, nr_iovecs); + bio->bi_flags |= idx << BVEC_POOL_OFFSET; } else if (nr_iovecs) { - bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); - } else { - bio_init(bio, NULL, 0); + bvl = bio->bi_inline_vecs; } bio->bi_pool = bs; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; return bio; err_free: @@ -468,31 +529,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, } EXPORT_SYMBOL(bio_alloc_bioset); -/** - * bio_kmalloc - kmalloc a bio for I/O - * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * - * Use kmalloc to allocate and initialize a bio. - * - * Returns: Pointer to new bio on success, NULL on failure. 
- */ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) -{ - struct bio *bio; - - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - if (unlikely(!bio)) - return NULL; - bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); - bio->bi_pool = NULL; - return bio; -} -EXPORT_SYMBOL(bio_kmalloc); - void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { unsigned long flags; @@ -628,7 +664,7 @@ EXPORT_SYMBOL(bio_put); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); + BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* * most users will be overriding ->bi_bdev with a new target, @@ -638,8 +674,6 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); - if (bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; @@ -949,18 +983,21 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) { - WARN_ON_ONCE(bio->bi_max_vecs); + const struct bio_vec *bv = iter->bvec; + unsigned int len; + size_t size; - bio->bi_vcnt = iter->nr_segs; - bio->bi_io_vec = (struct bio_vec *)iter->bvec; - bio->bi_iter.bi_bvec_done = iter->iov_offset; - bio->bi_iter.bi_size = iter->count; - bio_set_flag(bio, BIO_NO_PAGE_REF); - bio_set_flag(bio, BIO_CLONED); + if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) + return -EINVAL; - iov_iter_advance(iter, iter->count); + len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); + size = bio_add_page(bio, bv->bv_page, len, + bv->bv_offset + iter->iov_offset); + 
if (unlikely(size != len)) + return -EINVAL; + iov_iter_advance(iter, size); return 0; } @@ -1074,40 +1111,41 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided - * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs - * to ensure the bvecs and pages stay referenced until the submitted I/O is - * completed by a call to ->ki_complete() or returns with an error other than - * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF - * on IO completion. If it isn't, then pages should be released. + * pages. If we're adding kernel pages, and the caller told us it's safe to + * do so, we just have to add the pages to the bio directly. We don't grab an + * extra reference to those pages (the user should already have that), and we + * don't put the page on IO completion. The caller needs to check if the bio is + * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be + * released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. - * - * It's intended for direct IO, so doesn't do PSI tracking, the caller is - * responsible for setting BIO_WORKINGSET if necessary. 
*/ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - int ret = 0; + const bool is_bvec = iov_iter_is_bvec(iter); + int ret; - if (iov_iter_is_bvec(iter)) { - if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) - return -EINVAL; - return bio_iov_bvec_set(bio, iter); - } + if (WARN_ON_ONCE(bio->bi_vcnt)) + return -EINVAL; do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (WARN_ON_ONCE(is_bvec)) + return -EINVAL; ret = __bio_iov_append_get_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); + } else { + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); + if (is_bvec) + bio_set_flag(bio, BIO_NO_PAGE_REF); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -1512,7 +1550,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; + struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1565,17 +1603,15 @@ int bioset_init(struct bio_set *bs, unsigned int front_pad, int flags) { + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + bs->front_pad = front_pad; - if (flags & BIOSET_NEED_BVECS) - bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - else - bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - bs->bio_slab = bio_find_or_create_slab(bs); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) return -ENOMEM; @@ -1618,19 +1654,39 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -static int __init 
init_bio(void) +static void __init biovec_init_slabs(void) { int i; - bio_integrity_init(); - - for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { + for (i = 0; i < BVEC_POOL_NR; i++) { + int size; struct biovec_slab *bvs = bvec_slabs + i; - bvs->slab = kmem_cache_create(bvs->name, - bvs->nr_vecs * sizeof(struct bio_vec), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (bvs->nr_vecs <= BIO_INLINE_VECS) { + bvs->slab = NULL; + continue; + } + + size = bvs->nr_vecs * sizeof(struct bio_vec); + bvs->slab = kmem_cache_create(bvs->name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } +} + +static int __init init_bio(void) +{ + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), + GFP_KERNEL); + + BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); + + if (!bio_slabs) + panic("bio: can't allocate bios\n"); + + bio_integrity_init(); + biovec_init_slabs(); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a317c03d40f6..34d812d7cd3a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -32,6 +32,8 @@ #include #include "blk.h" +#define MAX_KEY_LEN 100 + /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. 
* blkcg_pol_register_mutex nests outside of it and synchronizes entire @@ -1763,15 +1765,12 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) if (unlikely(current->flags & PF_KTHREAD)) return; - if (current->throttle_queue != q) { - if (!blk_get_queue(q)) - return; - - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; - } + if (!blk_get_queue(q)) + return; + if (current->throttle_queue) + blk_put_queue(current->throttle_queue); + current->throttle_queue = q; if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); diff --git a/block/blk-core.c b/block/blk-core.c index 5e752840b41a..64f69022de96 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -531,7 +531,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (q->id < 0) goto fail_q; - ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (ret) goto fail_id; @@ -752,7 +752,7 @@ static int blk_partition_remap(struct bio *bio) bio->bi_iter.bi_sector - p->bd_start_sect); } - bio_set_flag(bio, BIO_REMAPPED); + bio->bi_bdev = bdev_whole(p); return 0; } @@ -815,12 +815,10 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) goto end_io; if (unlikely(bio_check_ro(bio))) goto end_io; - if (!bio_flagged(bio, BIO_REMAPPED)) { - if (unlikely(bio_check_eod(bio))) - goto end_io; - if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) - goto end_io; - } + if (unlikely(bio_check_eod(bio))) + goto end_io; + if (bio->bi_bdev->bd_partno && unlikely(blk_partition_remap(bio))) + goto end_io; /* * Filter flush bio's early so that bio based drivers without flush @@ -1299,11 +1297,7 @@ void blk_account_io_start(struct request *rq) if (!blk_do_io_stat(rq)) return; - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) - rq->part = rq->bio->bi_bdev; - else - rq->part = 
rq->rq_disk->part0; + rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_stat_lock(); update_io_ticks(rq->part, jiffies, false); @@ -1326,17 +1320,14 @@ static unsigned long __part_start_io_acct(struct block_device *part, return now; } -/** - * bio_start_io_acct - start I/O accounting for bio based drivers - * @bio: bio to start account for - * - * Returns the start time that should be passed back to bio_end_io_acct(). - */ -unsigned long bio_start_io_acct(struct bio *bio) +unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part, + struct bio *bio) { - return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio)); + *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); + + return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); } -EXPORT_SYMBOL_GPL(bio_start_io_acct); +EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) @@ -1359,12 +1350,12 @@ static void __part_end_io_acct(struct block_device *part, unsigned int op, part_stat_unlock(); } -void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, - struct block_device *orig_bdev) +void part_end_io_acct(struct block_device *part, struct bio *bio, + unsigned long start_time) { - __part_end_io_acct(orig_bdev, bio_op(bio), start_time); + __part_end_io_acct(part, bio_op(bio), start_time); } -EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); +EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 07bed8ae2bcd..9f9c4ddc76a5 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -164,12 +164,10 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) struct bio_vec bv; struct bio *bio; - bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); + bio = bio_alloc_bioset(GFP_NOIO, 
bio_segments(bio_src), NULL); if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; - if (bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/blk-exec.c b/block/blk-exec.c index beae70a0e5e5..85324d53d072 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -31,7 +31,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) } /** - * blk_execute_rq_nowait - insert a request to I/O scheduler for execution + * blk_execute_rq_nowait - insert a request into queue for execution + * @q: queue to insert the request in * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue @@ -44,8 +45,9 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) * Note: * This function will invoke @done directly if the queue is dead. */ -void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, - int at_head, rq_end_io_fn *done) +void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, + struct request *rq, int at_head, + rq_end_io_fn *done) { WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -65,6 +67,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); /** * blk_execute_rq - insert a request into queue for execution + * @q: queue to insert the request in * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue @@ -73,13 +76,14 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. 
*/ -void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head) +void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, + struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); + blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; diff --git a/block/blk-flush.c b/block/blk-flush.c index 7942ca6ed321..76c1624cb06c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -432,18 +432,23 @@ void blk_insert_flush(struct request *rq) /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) * * Description: * Issue a flush for the block device in question. */ -int blkdev_issue_flush(struct block_device *bdev) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) { - struct bio bio; + struct bio *bio; + int ret = 0; - bio_init(&bio, NULL, 0); - bio_set_dev(&bio, bdev); - bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - return submit_bio_wait(&bio); + bio = bio_alloc(gfp_mask, 0); + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + ret = submit_bio_wait(bio); + bio_put(bio); + return ret; } EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-mq.c b/block/blk-mq.c index f21d922ecfaf..74b17b396f4c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1646,42 +1646,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queue); -/* - * Is the request queue handled by an IO scheduler that does not respect - * hardware queues when dispatching? 
- */ -static bool blk_mq_has_sqsched(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.dispatch_request && - !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) - return true; - return false; -} - -/* - * Return prefered queue to dispatch from (if any) for non-mq aware IO - * scheduler. - */ -static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) -{ - struct blk_mq_hw_ctx *hctx; - - /* - * If the IO scheduler does not respect hardware queues when - * dispatching, we just don't bother with multiple HW queues and - * dispatch from hctx for the current CPU since running multiple queues - * just causes lock contention inside the scheduler and pointless cache - * bouncing. - */ - hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, - raw_smp_processor_id()); - if (!blk_mq_hctx_stopped(hctx)) - return hctx; - return NULL; -} - /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. @@ -1689,23 +1653,14 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { - struct blk_mq_hw_ctx *hctx, *sq_hctx; + struct blk_mq_hw_ctx *hctx; int i; - sq_hctx = NULL; - if (blk_mq_has_sqsched(q)) - sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - /* - * Dispatch from this hctx either if there's no hctx preferred - * by IO scheduler or if it has requests that bypass the - * scheduler. 
- */ - if (!sq_hctx || sq_hctx == hctx || - !list_empty_careful(&hctx->dispatch)) - blk_mq_run_hw_queue(hctx, async); + + blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); @@ -1717,23 +1672,14 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { - struct blk_mq_hw_ctx *hctx, *sq_hctx; + struct blk_mq_hw_ctx *hctx; int i; - sq_hctx = NULL; - if (blk_mq_has_sqsched(q)) - sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - /* - * Dispatch from this hctx either if there's no hctx preferred - * by IO scheduler or if it has requests that bypass the - * scheduler. - */ - if (!sq_hctx || sq_hctx == hctx || - !list_empty_careful(&hctx->dispatch)) - blk_mq_delay_run_hw_queue(hctx, msecs); + + blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); @@ -2707,6 +2653,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_hctx; atomic_set(&hctx->nr_active, 0); + atomic_set(&hctx->elevator_queued, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; diff --git a/block/blk-settings.c b/block/blk-settings.c index 7dd8be314ac6..43990b1d148b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -60,7 +60,6 @@ void blk_set_default_limits(struct queue_limits *lim) lim->io_opt = 0; lim->misaligned = 0; lim->zoned = BLK_ZONED_NONE; - lim->zone_write_granularity = 0; } EXPORT_SYMBOL(blk_set_default_limits); @@ -367,28 +366,6 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) } EXPORT_SYMBOL(blk_queue_physical_block_size); -/** - * blk_queue_zone_write_granularity - set zone write granularity for the queue - * @q: the request queue for the zoned device - * @size: the zone write granularity size, in bytes - * - * Description: - * This should be set to the lowest possible size allowing to write in - * sequential 
zones of a zoned block device. - */ -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size) -{ - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) - return; - - q->limits.zone_write_granularity = size; - - if (q->limits.zone_write_granularity < q->limits.logical_block_size) - q->limits.zone_write_granularity = q->limits.logical_block_size; -} -EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); - /** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device @@ -654,8 +631,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } - t->zone_write_granularity = max(t->zone_write_granularity, - b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); return ret; } @@ -872,8 +847,6 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); */ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) { - struct request_queue *q = disk->queue; - switch (model) { case BLK_ZONED_HM: /* @@ -892,7 +865,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. */ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - !xa_empty(&disk->part_tbl)) + disk_has_partitions(disk)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: @@ -902,17 +875,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) break; } - q->limits.zoned = model; - if (model != BLK_ZONED_NONE) { - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. 
- */ - blk_queue_zone_write_granularity(q, - queue_logical_block_size(q)); - } else { - blk_queue_clear_zone_settings(q); - } + disk->queue->limits.zoned = model; } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ae39c7f3d83d..b513f1683af0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -219,12 +219,6 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } -static ssize_t queue_zone_write_granularity_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_zone_write_granularity(q), page); -} - static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { unsigned long long max_sectors = q->limits.max_zone_append_sectors; @@ -591,7 +585,6 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); -QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); @@ -646,7 +639,6 @@ static struct attribute *queue_attrs[] = { &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, - &queue_zone_write_granularity_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 42aed0160f86..0321ca83e73f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -518,7 +518,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } -static inline bool wbt_should_throttle(struct bio *bio) +static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ 
-545,7 +545,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; - } else if (wbt_should_throttle(bio)) { + } else if (wbt_should_throttle(rwb, bio)) { if (current_is_kswapd()) flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 833978c02e60..7a68b6e4300c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -549,20 +549,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); - -void blk_queue_clear_zone_settings(struct request_queue *q) -{ - blk_mq_freeze_queue(q); - - blk_queue_free_zone_bitmaps(q); - blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); - q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; - q->nr_zones = 0; - q->max_open_zones = 0; - q->max_active_zones = 0; - q->limits.chunk_sectors = 0; - q->limits.zone_write_granularity = 0; - q->limits.max_zone_append_sectors = 0; - - blk_mq_unfreeze_queue(q); -} diff --git a/block/blk.h b/block/blk.h index 3b53e44b967e..10ab7c0d0766 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,11 +55,6 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); -#define BIO_INLINE_VECS 4 -struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, - gfp_t gfp_mask); -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); - static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { @@ -334,12 +329,12 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); -void blk_queue_clear_zone_settings(struct request_queue *q); #else static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} -static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} 
#endif +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); + int blk_alloc_devt(struct block_device *part, dev_t *devt); void blk_free_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); @@ -352,6 +347,7 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); +int disk_expand_part_tbl(struct gendisk *disk, int target); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/bounce.c b/block/bounce.c index fc55314aa426..a22a8a1942b2 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -247,8 +247,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; - if (bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/bsg.c b/block/bsg.c index bd10922d5cbb..d7bae94b64d9 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -157,10 +157,8 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) return PTR_ERR(rq); ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode); - if (ret) { - blk_put_request(rq); + if (ret) return ret; - } rq->timeout = msecs_to_jiffies(hdr.timeout); if (!rq->timeout) @@ -183,7 +181,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) bio = rq->bio; - blk_execute_rq(NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr); blk_rq_unmap_user(bio); diff --git a/block/genhd.c b/block/genhd.c index 36ff45bbaaaf..55f85c63d687 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -162,6 +162,15 
@@ static void part_in_flight_rw(struct block_device *part, inflight[1] = 0; } +static struct block_device *__disk_get_part(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); + + if (unlikely(partno < 0 || partno >= ptbl->len)) + return NULL; + return rcu_dereference(ptbl->part[partno]); +} + /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -176,14 +185,26 @@ static void part_in_flight_rw(struct block_device *part, void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags) { + struct disk_part_tbl *ptbl; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + piter->disk = disk; piter->part = NULL; - if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) + + if (flags & DISK_PITER_REVERSE) + piter->idx = ptbl->len - 1; + else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) piter->idx = 0; else piter->idx = 1; + piter->flags = flags; + + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(disk_part_iter_init); /** * disk_part_iter_next - proceed iterator to the next partition and return it @@ -196,30 +217,57 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, */ struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { - struct block_device *part; - unsigned long idx; + struct disk_part_tbl *ptbl; + int inc, end; /* put the last partition */ disk_part_iter_exit(piter); + /* get part_tbl */ rcu_read_lock(); - xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) { - if (!bdev_nr_sectors(part) && - !(piter->flags & DISK_PITER_INCL_EMPTY) && - !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) - continue; + ptbl = rcu_dereference(piter->disk->part_tbl); + /* determine iteration parameters */ + if (piter->flags & DISK_PITER_REVERSE) { + inc = -1; + if (piter->flags & (DISK_PITER_INCL_PART0 | + DISK_PITER_INCL_EMPTY_PART0)) + end = -1; + else 
+ end = 0; + } else { + inc = 1; + end = ptbl->len; + } + + /* iterate to the next partition */ + for (; piter->idx != end; piter->idx += inc) { + struct block_device *part; + + part = rcu_dereference(ptbl->part[piter->idx]); + if (!part) + continue; piter->part = bdgrab(part); if (!piter->part) continue; - piter->idx = idx + 1; + if (!bdev_nr_sectors(part) && + !(piter->flags & DISK_PITER_INCL_EMPTY) && + !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && + piter->idx == 0)) { + bdput(piter->part); + piter->part = NULL; + continue; + } + + piter->idx += inc; break; } + rcu_read_unlock(); return piter->part; } +EXPORT_SYMBOL_GPL(disk_part_iter_next); /** * disk_part_iter_exit - finish up partition iteration @@ -236,6 +284,91 @@ void disk_part_iter_exit(struct disk_part_iter *piter) bdput(piter->part); piter->part = NULL; } +EXPORT_SYMBOL_GPL(disk_part_iter_exit); + +static inline int sector_in_part(struct block_device *part, sector_t sector) +{ + return part->bd_start_sect <= sector && + sector < part->bd_start_sect + bdev_nr_sectors(part); +} + +/** + * disk_map_sector_rcu - map sector to partition + * @disk: gendisk of interest + * @sector: sector to map + * + * Find out which partition @sector maps to on @disk. This is + * primarily used for stats accounting. + * + * CONTEXT: + * RCU read locked. + * + * RETURNS: + * Found partition on success, part0 is returned if no partition matches + * or the matched partition is being deleted. 
+ */ +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +{ + struct disk_part_tbl *ptbl; + struct block_device *part; + int i; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + part = rcu_dereference(ptbl->last_lookup); + if (part && sector_in_part(part, sector)) + goto out_unlock; + + for (i = 1; i < ptbl->len; i++) { + part = rcu_dereference(ptbl->part[i]); + if (part && sector_in_part(part, sector)) { + rcu_assign_pointer(ptbl->last_lookup, part); + goto out_unlock; + } + } + + part = disk->part0; +out_unlock: + rcu_read_unlock(); + return part; +} + +/** + * disk_has_partitions + * @disk: gendisk of interest + * + * Walk through the partition table and check if valid partition exists. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * True if the gendisk has at least one valid non-zero size partition. + * Otherwise false. + */ +bool disk_has_partitions(struct gendisk *disk) +{ + struct disk_part_tbl *ptbl; + int i; + bool ret = false; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + /* Iterate partitions skipping the whole device at index 0 */ + for (i = 1; i < ptbl->len; i++) { + if (rcu_dereference(ptbl->part[i])) { + ret = true; + break; + } + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(disk_has_partitions); /* * Can be deleted altogether. Later. 
@@ -471,18 +604,6 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -void disk_uevent(struct gendisk *disk, enum kobject_action action) -{ - struct disk_part_iter piter; - struct block_device *part; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(bdev_kobj(part), action); - disk_part_iter_exit(&piter); -} -EXPORT_SYMBOL_GPL(disk_uevent); - static void disk_scan_partitions(struct gendisk *disk) { struct block_device *bdev; @@ -500,6 +621,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct block_device *part; int err; ddev->parent = parent; @@ -542,9 +665,15 @@ static void register_disk(struct device *parent, struct gendisk *disk, disk_scan_partitions(disk); - /* announce the disk and partitions after all partitions are created */ + /* announce disk after possible partitions are created */ dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(bdev_kobj(part), KOBJ_ADD); + disk_part_iter_exit(&piter); if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, @@ -700,7 +829,8 @@ void del_gendisk(struct gendisk *disk) down_write(&bdev_lookup_sem); /* invalidate stuff */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); delete_partition(part); @@ -799,7 +929,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno) struct block_device *bdev = NULL; rcu_read_lock(); - bdev = xa_load(&disk->part_tbl, partno); + bdev = __disk_get_part(disk, partno); if 
(bdev && !bdgrab(bdev)) bdev = NULL; rcu_read_unlock(); @@ -1189,6 +1319,83 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; +/** + * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way + * @disk: disk to replace part_tbl for + * @new_ptbl: new part_tbl to install + * + * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The + * original ptbl is freed using RCU callback. + * + * LOCKING: + * Matching bd_mutex locked or the caller is the only user of @disk. + */ +static void disk_replace_part_tbl(struct gendisk *disk, + struct disk_part_tbl *new_ptbl) +{ + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + + rcu_assign_pointer(disk->part_tbl, new_ptbl); + + if (old_ptbl) { + rcu_assign_pointer(old_ptbl->last_lookup, NULL); + kfree_rcu(old_ptbl, rcu_head); + } +} + +/** + * disk_expand_part_tbl - expand disk->part_tbl + * @disk: disk to expand part_tbl for + * @partno: expand such that this partno can fit in + * + * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl + * uses RCU to allow unlocked dereferencing for stats and other stuff. + * + * LOCKING: + * Matching bd_mutex locked or the caller is the only user of @disk. + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int disk_expand_part_tbl(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + struct disk_part_tbl *new_ptbl; + int len = old_ptbl ? old_ptbl->len : 0; + int i, target; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. 
+ */ + target = partno + 1; + if (target < 0) + return -EINVAL; + + /* disk_max_parts() is zero during initialization, ignore if so */ + if (disk_max_parts(disk) && target > disk_max_parts(disk)) + return -EINVAL; + + if (target <= len) + return 0; + + new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, + disk->node_id); + if (!new_ptbl) + return -ENOMEM; + + new_ptbl->len = target; + + for (i = 0; i < len; i++) + rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); + + disk_replace_part_tbl(disk, new_ptbl); + return 0; +} + /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk @@ -1212,7 +1419,7 @@ static void disk_release(struct device *dev) blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); - xa_destroy(&disk->part_tbl); + disk_replace_part_tbl(disk, NULL); bdput(disk->part0); if (disk->queue) blk_put_queue(disk->queue); @@ -1365,6 +1572,7 @@ dev_t blk_lookup_devt(const char *name, int partno) struct gendisk *__alloc_disk_node(int minors, int node_id) { struct gendisk *disk; + struct disk_part_tbl *ptbl; if (minors > DISK_MAX_PARTS) { printk(KERN_ERR @@ -1382,9 +1590,11 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_free_disk; disk->node_id = node_id; - xa_init(&disk->part_tbl); - if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) - goto out_destroy_part_tbl; + if (disk_expand_part_tbl(disk, 0)) + goto out_bdput; + + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + rcu_assign_pointer(ptbl->part[0], disk->part0); disk->minors = minors; rand_initialize_disk(disk); @@ -1393,8 +1603,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_destroy_part_tbl: - xa_destroy(&disk->part_tbl); +out_bdput: bdput(disk->part0); out_free_disk: kfree(disk); @@ -1432,7 +1641,7 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) /** * set_disk_ro - set a gendisk 
read-only * @disk: gendisk to operate on - * @read_only: %true to set the disk read-only, %false set the disk read/write + * @ready_only: %true to set the disk read-only, %false set the disk read/write * * This function is used to indicate whether a given disk device should have its * read-only flag set. set_disk_ro() is typically used by device drivers to diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index c25c41d0d061..dc89199bc8c6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -1029,7 +1029,6 @@ static struct elevator_type kyber_sched = { #endif .elevator_attrs = kyber_sched_attrs, .elevator_name = "kyber", - .elevator_features = ELEVATOR_F_MQ_AWARE, .elevator_owner = THIS_MODULE, }; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b57470e154c8..800ac902809b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -386,6 +386,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); + if (rq) + atomic_dec(&rq->mq_hctx->elevator_queued); return rq; } @@ -533,6 +535,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); + atomic_inc(&hctx->elevator_queued); } spin_unlock(&dd->lock); } @@ -579,6 +582,9 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + if (!atomic_read(&hctx->elevator_queued)) + return false; + return !list_empty_careful(&dd->dispatch) || !list_empty_careful(&dd->fifo_list[0]) || !list_empty_careful(&dd->fifo_list[1]); diff --git a/block/partitions/core.c b/block/partitions/core.c index f3d9ff2cafb6..5e916fa1d6dd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -289,7 +289,13 @@ struct device_type part_type = { */ void delete_partition(struct block_device *part) { - 
xa_erase(&part->bd_disk->part_tbl, part->bd_partno); + struct gendisk *disk = part->bd_disk; + struct disk_part_tbl *ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + + rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->bd_holder_dir); device_del(&part->bd_device); @@ -321,6 +327,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; + struct disk_part_tbl *ptbl; const char *dname; int err; @@ -336,13 +343,18 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, case BLK_ZONED_HA: pr_info("%s: disabling host aware zoned block device support due to partitions\n", disk->disk_name); - blk_queue_set_zoned(disk, BLK_ZONED_NONE); + disk->queue->limits.zoned = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: break; } - if (xa_load(&disk->part_tbl, partno)) + err = disk_expand_part_tbl(disk, partno); + if (err) + return ERR_PTR(err); + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + + if (ptbl->part[partno]) return ERR_PTR(-EBUSY); bdev = bdev_alloc(disk, partno); @@ -395,10 +407,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ - err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); - if (err) - goto out_del; bdev_add(bdev, devt); + rcu_assign_pointer(ptbl->part[partno], bdev); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) @@ -604,7 +614,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state; - int ret = -EAGAIN, p; + int ret = -EAGAIN, p, highest; if (!disk_part_scan_enabled(disk)) return 0; @@ -652,6 +662,15 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) /* tell userspace 
that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + /* + * Detect the highest partition number and preallocate disk->part_tbl. + * This is an optimization and not strictly necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + disk_expand_part_tbl(disk, highest); + for (p = 1; p < state->limit; p++) if (!blk_add_partition(disk, bdev, state, p)) goto out_free_state; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 6599bac0a78c..c9f009cc0446 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -357,7 +357,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, * (if he doesn't check that is his problem). * N.B. a non-zero SCSI status is _not_ necessarily an error. */ - blk_execute_rq(bd_disk, rq, at_head); + blk_execute_rq(q, bd_disk, rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); @@ -493,7 +493,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; } - blk_execute_rq(disk, rq, 0); + blk_execute_rq(q, disk, rq, 0); err = req->result & 0xff; /* only 8 bit SCSI status */ if (err) { @@ -532,7 +532,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, scsi_req(rq)->cmd[0] = cmd; scsi_req(rq)->cmd[4] = data; scsi_req(rq)->cmd_len = 6; - blk_execute_rq(bd_disk, rq, 0); + blk_execute_rq(q, bd_disk, rq, 0); err = scsi_req(rq)->result ? 
-EIO : 0; blk_put_request(rq); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 72cf7603d51f..7227fc7ab8ed 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -138,7 +138,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, op_flags |= REQ_FUA | REQ_PREFLUSH; op_flags |= REQ_SYNC; - bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); + bio = bio_alloc_drbd(GFP_NOIO); bio_set_dev(bio, bdev->md_bdev); bio->bi_iter.bi_sector = sector; err = -EIO; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index c1f816f896a8..df53dca5d02c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -976,7 +976,7 @@ static void drbd_bm_endio(struct bio *bio) static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); + struct bio *bio = bio_alloc_drbd(GFP_NOIO); struct drbd_device *device = ctx->device; struct drbd_bitmap *b = device->bitmap; struct page *page; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 02db50d7e4c6..b2c93a29c251 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1422,6 +1422,8 @@ extern mempool_t drbd_md_io_page_pool; /* We also need to make sure we get a bio * when we need it for housekeeping purposes */ extern struct bio_set drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); /* And a bio_set for cloning */ extern struct bio_set drbd_io_bio_set; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 788dd97e6026..1c8c18b2a25f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -138,6 +138,19 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + 
struct bio *bio; + + if (!bioset_initialized(&drbd_md_io_bio_set)) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set); + if (!bio) + return NULL; + return bio; +} + #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will give tons of false positives. When this is a real functions sparse works. diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 9dbb660a7d7c..ea0f31ab3343 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -30,10 +30,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio return NULL; memset(req, 0, sizeof(*req)); - req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); - req->private_bio->bi_private = req; - req->private_bio->bi_end_io = drbd_request_endio; - + drbd_req_make_private_bio(req, bio_src); req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 511f39a08de4..55bb0f8721fa 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -256,6 +256,18 @@ enum drbd_req_state_bits { #define MR_WRITE 1 #define MR_READ 2 +static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) +{ + struct bio *bio; + bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); + + req->private_bio = bio; + + bio->bi_private = req; + bio->bi_end_io = drbd_request_endio; + bio->bi_next = NULL; +} + /* Short lived temporary struct on the stack. * We could squirrel the error to be returned into * bio->bi_iter.bi_size, or similar. But that would be too ugly. 
*/ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 64563bfdf0da..02044ab7f767 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1523,11 +1523,8 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) drbd_al_begin_io(device, &req->i); - req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO, - &drbd_io_bio_set); + drbd_req_make_private_bio(req, req->master_bio); bio_set_dev(req->private_bio, device->ldev->backing_bdev); - req->private_bio->bi_private = req; - req->private_bio->bi_end_io = drbd_request_endio; submit_bio_noacct(req->private_bio); return 0; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3fd99836bb1c..53ac59d19ae5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, rq->timeout = timeout; /* insert request and run queue */ - blk_execute_rq(NULL, rq, true); + blk_execute_rq(rq->q, NULL, rq, true); if (int_cmd->status) { dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index bfcab1c782b5..fce0a54df0e5 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -148,6 +148,10 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) sector += dev->zone_size_sects; } + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + return 0; } @@ -156,10 +160,6 @@ int null_register_zoned_dev(struct nullb *nullb) struct nullb_device *dev = nullb->dev; struct request_queue *q = nullb->q; - blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - 
blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - if (queue_is_mq(q)) { int ret = blk_revalidate_disk_zones(nullb->disk, NULL); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 897acda20ac8..a7af4f27b7c3 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk, req = blk_mq_rq_to_pdu(rq); req->func = func; - blk_execute_rq(disk->gd, rq, 0); + blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); blk_put_request(rq); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index fc4b0f1aa86d..658a0981cb54 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * if (cgc->quiet) rq->rq_flags |= RQF_QUIET; - blk_execute_rq(pd->bdev->bd_disk, rq, 0); + blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0); if (scsi_req(rq)->result) ret = -EIO; out: diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 2cdf2771f8e8..4478eb7efee0 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -539,7 +539,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); + blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); return 0; @@ -578,7 +578,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); + blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); return 0; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 37c409f3cd83..db3903e24d78 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -322,7 +322,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) 
if (err) goto out; - blk_execute_rq(vblk->disk, req, false); + blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_put_request(req); diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 90ad34c6ef8e..8f0e52a71493 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2214,7 +2214,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, rq->timeout = 60 * HZ; bio = rq->bio; - blk_execute_rq(cdi->disk, rq, 0); + blk_execute_rq(q, cdi->disk, rq, 0); if (scsi_req(rq)->result) { struct scsi_sense_hdr sshdr; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index a1ce9f5ac3aa..013ad33fbbc8 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -107,7 +107,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, memcpy(scsi_req(rq)->cmd, pc->c, 12); if (drive->media == ide_tape) scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1; - blk_execute_rq(disk, rq, 0); + blk_execute_rq(drive->queue, disk, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; put_req: blk_put_request(rq); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index cffbcc27a34c..25d2d88e82ad 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -467,7 +467,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, } } - blk_execute_rq(info->disk, rq, 0); + blk_execute_rq(drive->queue, info->disk, rq, 0); error = scsi_req(rq)->result ? 
-EIO : 0; if (buffer) diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 011eab9c69b7..46f2df288c6a 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -299,7 +299,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; - blk_execute_rq(cd->disk, rq, 0); + blk_execute_rq(drive->queue, cd->disk, rq, 0); ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); /* diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index ca1d4b3d3878..f2f93ed40356 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -173,7 +173,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, *(int *)&scsi_req(rq)->cmd[1] = arg; ide_req(rq)->special = setting->set; - blk_execute_rq(NULL, rq, 0); + blk_execute_rq(q, NULL, rq, 0); ret = scsi_req(rq)->result; blk_put_request(rq); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 8413731c6259..34b9441084f8 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -482,7 +482,7 @@ static int set_multcount(ide_drive_t *drive, int arg) drive->mult_req = arg; drive->special_flags |= IDE_SFLAG_SET_MULTMODE; - blk_execute_rq(NULL, rq, 0); + blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); return (drive->mult_count == arg) ? 0 : -EIO; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 43fbc37d85c3..58994da10c06 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -137,7 +137,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, void __user *argp) rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_TASKFILE; - blk_execute_rq(NULL, rq, 0); + blk_execute_rq(drive->queue, NULL, rq, 0); err = scsi_req(rq)->result ? 
-EIO : 0; blk_put_request(rq); @@ -235,7 +235,7 @@ static int generic_drive_reset(ide_drive_t *drive) ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; - blk_execute_rq(NULL, rq, 1); + blk_execute_rq(drive->queue, NULL, rq, 1); ret = scsi_req(rq)->result; blk_put_request(rq); return ret; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index a80a0f28f7b9..8af7af6001eb 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -37,7 +37,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; ide_req(rq)->special = &timeout; - blk_execute_rq(NULL, rq, 1); + blk_execute_rq(q, NULL, rq, 1); rc = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); if (rc) diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index d680b3e3295f..82ab308f1aaf 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -27,7 +27,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) mesg.event = PM_EVENT_FREEZE; rqpm.pm_state = mesg.event; - blk_execute_rq(NULL, rq, 0); + blk_execute_rq(drive->queue, NULL, rq, 0); ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); @@ -50,7 +50,7 @@ static int ide_pm_execute_rq(struct request *rq) blk_mq_end_request(rq, BLK_STS_OK); return -ENXIO; } - blk_execute_rq(NULL, rq, true); + blk_execute_rq(q, NULL, rq, true); return scsi_req(rq)->result ? 
-EIO : 0; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index fa05e7e7d609..88b96437b22e 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -868,7 +868,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) goto out_put; } - blk_execute_rq(tape->disk, rq, 0); + blk_execute_rq(drive->queue, tape->disk, rq, 0); /* calculate the number of transferred bytes and update buffer state */ size -= scsi_req(rq)->resid_len; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 6665fc4724b9..d016cbe68cba 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -443,7 +443,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, ide_req(rq)->special = cmd; cmd->rq = rq; - blk_execute_rq(NULL, rq, 0); + blk_execute_rq(drive->queue, NULL, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; put_req: blk_put_request(rq); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 63e809f38e3f..058dd8014428 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; - bio_set_dev(check, bio->bi_bdev); + check->bi_bdev = bio->bi_bdev; check->bi_opf = REQ_OP_READ; check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 29c231758293..dfc35d6d05ed 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -475,7 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; - struct block_device *orig_bdev; + struct block_device *part; unsigned long start_time; struct btree_op op; @@ -670,8 +670,8 @@ static void bio_complete(struct search *s) { if (s->orig_bio) { /* Count on bcache device */ - bio_end_io_acct_remapped(s->orig_bio, s->start_time, - 
s->orig_bdev); + part_end_io_acct(s->part, s->orig_bio, s->start_time); + trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); @@ -714,8 +714,7 @@ static void search_free(struct closure *cl) } static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d, struct block_device *orig_bdev, - unsigned long start_time) + struct bcache_device *d) { struct search *s; @@ -733,8 +732,7 @@ static inline struct search *search_alloc(struct bio *bio, s->write = op_is_write(bio_op(bio)); s->read_dirty_data = 0; /* Count on the bcache device */ - s->orig_bdev = orig_bdev; - s->start_time = start_time; + s->start_time = part_start_io_acct(d->disk, &s->part, bio); s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -1076,7 +1074,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; - struct block_device *orig_bdev; + struct block_device *part; }; static void detached_dev_end_io(struct bio *bio) @@ -1088,7 +1086,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_private = ddip->bi_private; /* Count on the bcache device */ - bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev); + part_end_io_acct(ddip->part, bio, ddip->start_time); if (bio->bi_status) { struct cached_dev *dc = container_of(ddip->d, @@ -1101,8 +1099,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io(bio); } -static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, - struct block_device *orig_bdev, unsigned long start_time) +static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) { struct detached_dev_io_private *ddip; struct cached_dev *dc = container_of(d, struct cached_dev, disk); @@ -1115,8 +1112,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ddip->d = d; /* Count on the bcache device */ - 
ddip->orig_bdev = orig_bdev; - ddip->start_time = start_time; + ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio); ddip->bi_end_io = bio->bi_end_io; ddip->bi_private = bio->bi_private; bio->bi_end_io = detached_dev_end_io; @@ -1172,10 +1168,8 @@ static void quit_max_writeback_rate(struct cache_set *c, blk_qc_t cached_dev_submit_bio(struct bio *bio) { struct search *s; - struct block_device *orig_bdev = bio->bi_bdev; - struct bcache_device *d = orig_bdev->bd_disk->private_data; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); - unsigned long start_time; int rw = bio_data_dir(bio); if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || @@ -1200,13 +1194,11 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } } - start_time = bio_start_io_acct(bio); - bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { - s = search_alloc(bio, d, orig_bdev, start_time); + s = search_alloc(bio, d); trace_bcache_request_start(s->d, bio); if (!bio->bi_iter.bi_size) { @@ -1227,7 +1219,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } } else /* I/O request sent to backing device */ - detached_dev_do_request(d, bio, orig_bdev, start_time); + detached_dev_do_request(d, bio); return BLK_QC_T_NONE; } @@ -1291,7 +1283,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio)); + s = search_alloc(bio, d); cl = &s->cl; bio = &s->bio.bio; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 193fe7652329..2047a9cccdb5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1939,7 +1939,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) goto err; if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_RESCUER)) + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto err; c->uuids = 
alloc_meta_bucket_pages(GFP_KERNEL, sb); diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index a90bdf9b2ca6..bdb255edc200 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -85,6 +85,12 @@ struct clone { struct dm_clone_metadata *cmd; + /* + * bio used to flush the destination device, before committing the + * metadata. + */ + struct bio flush_bio; + /* Region hydration hash table */ struct hash_table_bucket *ht; @@ -1149,7 +1155,11 @@ static int commit_metadata(struct clone *clone, bool *dest_dev_flushed) goto out; } - r = blkdev_issue_flush(clone->dest_dev->bdev); + bio_reset(&clone->flush_bio); + bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev); + clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + r = submit_bio_wait(&clone->flush_bio); if (unlikely(r)) { __metadata_operation_failed(clone, "flush destination device", r); goto out; @@ -1876,6 +1886,7 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) bio_list_init(&clone->deferred_flush_completions); clone->hydration_offset = 0; atomic_set(&clone->hydrations_in_flight, 0); + bio_init(&clone->flush_bio, NULL, 0); clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); if (!clone->wq) { @@ -1947,6 +1958,7 @@ static void clone_dtr(struct dm_target *ti) struct clone *clone = ti->private; mutex_destroy(&clone->commit_lock); + bio_uninit(&clone->flush_bio); for (i = 0; i < clone->nr_ctr_args; i++) kfree(clone->ctr_args[i]); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 039d17b28938..b298fefb022e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -819,7 +819,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(dev->bdev); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); return ret; } @@ -862,7 +862,7 @@ static int 
dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(dev->bdev); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); return ret; } @@ -933,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(dev->bdev); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); goto err; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 21da0c48f6c2..cf06dbb1aa53 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -340,6 +340,24 @@ static int start_readonly; */ static bool create_on_open = true; +struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + struct mddev *mddev) +{ + if (!mddev || !bioset_initialized(&mddev->bio_set)) + return bio_alloc(gfp_mask, nr_iovecs); + + return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set); +} +EXPORT_SYMBOL_GPL(bio_alloc_mddev); + +static struct bio *md_bio_alloc_sync(struct mddev *mddev) +{ + if (!mddev || !bioset_initialized(&mddev->sync_set)) + return bio_alloc(GFP_NOIO, 1); + + return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); +} + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -445,8 +463,8 @@ struct md_io { struct mddev *mddev; bio_end_io_t *orig_bi_end_io; void *orig_bi_private; - struct block_device *orig_bi_bdev; unsigned long start_time; + struct block_device *part; }; static void md_end_io(struct bio *bio) @@ -454,7 +472,7 @@ static void md_end_io(struct bio *bio) struct md_io *md_io = bio->bi_private; struct mddev *mddev = md_io->mddev; - bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev); + part_end_io_acct(md_io->part, bio, md_io->start_time); bio->bi_end_io = md_io->orig_bi_end_io; bio->bi_private = md_io->orig_bi_private; @@ -496,12 +514,12 @@ static blk_qc_t md_submit_bio(struct bio *bio) 
md_io->mddev = mddev; md_io->orig_bi_end_io = bio->bi_end_io; md_io->orig_bi_private = bio->bi_private; - md_io->orig_bi_bdev = bio->bi_bdev; bio->bi_end_io = md_end_io; bio->bi_private = md_io; - md_io->start_time = bio_start_io_acct(bio); + md_io->start_time = part_start_io_acct(mddev->gendisk, + &md_io->part, bio); } /* bio could be mergeable after passing to underlayer */ @@ -595,7 +613,7 @@ static void submit_flushes(struct work_struct *ws) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set); + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bio_set_dev(bi, rdev->bdev); @@ -981,7 +999,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); + bio = md_bio_alloc_sync(mddev); atomic_inc(&rdev->nr_pending); @@ -1013,29 +1031,29 @@ int md_super_wait(struct mddev *mddev) int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op) { - struct bio bio; - struct bio_vec bvec; - - bio_init(&bio, &bvec, 1); + struct bio *bio = md_bio_alloc_sync(rdev->mddev); + int ret; if (metadata_op && rdev->meta_bdev) - bio_set_dev(&bio, rdev->meta_bdev); + bio_set_dev(bio, rdev->meta_bdev); else - bio_set_dev(&bio, rdev->bdev); - bio.bi_opf = op | op_flags; + bio_set_dev(bio, rdev->bdev); + bio_set_op_attrs(bio, op, op_flags); if (metadata_op) - bio.bi_iter.bi_sector = sector + rdev->sb_start; + bio->bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) - bio.bi_iter.bi_sector = sector + rdev->new_data_offset; + bio->bi_iter.bi_sector = sector + rdev->new_data_offset; else - bio.bi_iter.bi_sector = sector + rdev->data_offset; - bio_add_page(&bio, page, size, 
0); + bio->bi_iter.bi_sector = sector + rdev->data_offset; + bio_add_page(bio, page, size, 0); - submit_bio_wait(&bio); + submit_bio_wait(bio); - return !bio.bi_status; + ret = !bio->bi_status; + bio_put(bio); + return ret; } EXPORT_SYMBOL_GPL(sync_page_io); @@ -2399,12 +2417,6 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_add_rdev); -static bool rdev_read_only(struct md_rdev *rdev) -{ - return bdev_read_only(rdev->bdev) || - (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); -} - static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; @@ -2414,7 +2426,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - if (rdev_read_only(rdev) && mddev->pers) + if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) && + mddev->pers) return -EROFS; /* make sure rdev->sectors exceeds mddev->dev_sectors */ @@ -5848,7 +5861,9 @@ int md_run(struct mddev *mddev) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); - if (mddev->ro != 1 && rdev_read_only(rdev)) { + if (mddev->ro != 1 && + (bdev_read_only(rdev->bdev) || + bdev_read_only(rdev->meta_bdev))) { mddev->ro = 1; if (mddev->gendisk) set_disk_ro(mddev->gendisk, 1); @@ -6143,7 +6158,7 @@ static int restart_array(struct mddev *mddev) if (test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) has_journal = true; - if (rdev_read_only(rdev)) + if (bdev_read_only(rdev->bdev)) has_readonly = true; } rcu_read_unlock(); diff --git a/drivers/md/md.h b/drivers/md/md.h index bcbba1b5ec4a..f13290ccc1c2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -742,6 +742,8 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); +extern struct bio *bio_alloc_mddev(gfp_t 
gfp_mask, int nr_iovecs, + struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d2378765dc15..3b19141cdb4b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1104,7 +1104,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, int i = 0; struct bio *behind_bio = NULL; - behind_bio = bio_alloc_bioset(GFP_NOIO, vcnt, &r1_bio->mddev->bio_set); + behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev); if (!behind_bio) return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a9ae7d113492..be8f14afb6d1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4531,7 +4531,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, return sectors_done; } - read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set); + read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); bio_set_dev(read_bio, rdev->bdev); read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr @@ -4539,6 +4539,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); + read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); + read_bio->bi_status = 0; + read_bio->bi_vcnt = 0; + read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index e8c118e05dfd..d0f540296fe9 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, } /* flush the disk cache after recovery if necessary */ - ret = blkdev_issue_flush(rdev->bdev); + ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL); out: __free_page(page); return ret; diff --git 
a/drivers/md/raid5.c b/drivers/md/raid5.c index a348b2adf2a9..f411b9e5c332 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5393,72 +5393,90 @@ static void raid5_align_endio(struct bio *bi) static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) { struct r5conf *conf = mddev->private; - struct bio *align_bio; + int dd_idx; + struct bio* align_bi; struct md_rdev *rdev; - sector_t sector, end_sector, first_bad; - int bad_sectors, dd_idx; + sector_t end_sector; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); return 0; } + /* + * use bio_clone_fast to make a copy of the bio + */ + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); + if (!align_bi) + return 0; + /* + * set bi_end_io to a new function, and set bi_private to the + * original bio. + */ + align_bi->bi_end_io = raid5_align_endio; + align_bi->bi_private = raid_bio; + /* + * compute position + */ + align_bi->bi_iter.bi_sector = + raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, + 0, &dd_idx, NULL); - sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, - &dd_idx, NULL); - end_sector = bio_end_sector(raid_bio); - + end_sector = bio_end_sector(align_bi); rcu_read_lock(); - if (r5c_big_stripe_cached(conf, sector)) - goto out_rcu_unlock; - rdev = rcu_dereference(conf->disks[dd_idx].replacement); if (!rdev || test_bit(Faulty, &rdev->flags) || rdev->recovery_offset < end_sector) { rdev = rcu_dereference(conf->disks[dd_idx].rdev); - if (!rdev) - goto out_rcu_unlock; - if (test_bit(Faulty, &rdev->flags) || + if (rdev && + (test_bit(Faulty, &rdev->flags) || !(test_bit(In_sync, &rdev->flags) || - rdev->recovery_offset >= end_sector)) - goto out_rcu_unlock; + rdev->recovery_offset >= end_sector))) + rdev = NULL; } - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); - bio_set_dev(align_bio, rdev->bdev); - align_bio->bi_end_io = raid5_align_endio; - 
align_bio->bi_private = raid_bio; - align_bio->bi_iter.bi_sector = sector; - - raid_bio->bi_next = (void *)rdev; - - if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad, - &bad_sectors)) { - bio_put(align_bio); - rdev_dec_pending(rdev, mddev); + if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { + rcu_read_unlock(); + bio_put(align_bi); return 0; } - /* No reshape active, so we can trust rdev->data_offset */ - align_bio->bi_iter.bi_sector += rdev->data_offset; + if (rdev) { + sector_t first_bad; + int bad_sectors; - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, - conf->device_lock); - atomic_inc(&conf->active_aligned_reads); - spin_unlock_irq(&conf->device_lock); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + raid_bio->bi_next = (void*)rdev; + bio_set_dev(align_bi, rdev->bdev); - if (mddev->gendisk) - trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), - raid_bio->bi_iter.bi_sector); - submit_bio_noacct(align_bio); - return 1; + if (is_badblock(rdev, align_bi->bi_iter.bi_sector, + bio_sectors(align_bi), + &first_bad, &bad_sectors)) { + bio_put(align_bi); + rdev_dec_pending(rdev, mddev); + return 0; + } -out_rcu_unlock: - rcu_read_unlock(); - return 0; + /* No reshape active, so we can trust rdev->data_offset */ + align_bi->bi_iter.bi_sector += rdev->data_offset; + + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_quiescent, + conf->quiesce == 0, + conf->device_lock); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + + if (mddev->gendisk) + trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk), + raid_bio->bi_iter.bi_sector); + submit_bio_noacct(align_bi); + return 1; + } else { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } } static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 
a1d6b68320ae..42e27a298218 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -253,7 +253,7 @@ static ssize_t power_ro_lock_store(struct device *dev, goto out_put; } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(mq->queue, NULL, req, 0); ret = req_to_mmc_queue_req(req)->drv_op_result; blk_put_request(req); @@ -629,7 +629,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idatas; req_to_mmc_queue_req(req)->ioc_count = 1; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(mq->queue, NULL, req, 0); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata); blk_put_request(req); @@ -698,7 +698,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idata; req_to_mmc_queue_req(req)->ioc_count = num_of_cmds; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(mq->queue, NULL, req, 0); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; /* copy to user if data and response */ @@ -2722,7 +2722,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) if (IS_ERR(req)) return PTR_ERR(req); req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(mq->queue, NULL, req, 0); ret = req_to_mmc_queue_req(req)->drv_op_result; if (ret >= 0) { *val = ret; @@ -2761,7 +2761,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp) } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD; req_to_mmc_queue_req(req)->drv_op_data = &ext_csd; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(mq->queue, NULL, req, 0); err = req_to_mmc_queue_req(req)->drv_op_result; blk_put_request(req); if (err) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 89cacff897a8..ad6a4ae5ed6f 100644 --- 
a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -925,7 +925,7 @@ static void nvme_execute_rq_polled(struct request_queue *q, rq->cmd_flags |= REQ_HIPRI; rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq); + blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); while (!completion_done(&wait)) { blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); @@ -964,7 +964,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, if (poll) nvme_execute_rq_polled(req->q, NULL, req, at_head); else - blk_execute_rq(NULL, req, at_head); + blk_execute_rq(req->q, NULL, req, at_head); if (result) *result = nvme_req(req)->result; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -1101,7 +1101,7 @@ void nvme_execute_passthru_rq(struct request *rq) u32 effects; effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - blk_execute_rq(disk, rq, 0); + blk_execute_rq(rq->q, disk, rq, 0); nvme_passthru_end(ctrl, effects); } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); @@ -1133,8 +1133,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (ret) goto out; bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); + bio->bi_bdev = bdev; if (bdev && meta_buffer && meta_len) { meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, meta_seed, write); @@ -1203,7 +1202,7 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl) rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; - blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); + blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); return 0; } @@ -2176,18 +2175,17 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[lbaf].ds; nvme_set_queue_limits(ns->ctrl, ns->queue); - ret = nvme_configure_metadata(ns, id); - if (ret) - goto out_unfreeze; - nvme_set_chunk_sectors(ns, id); - nvme_update_disk_info(ns->disk, ns, id); - if (ns->head->ids.csi == 
NVME_CSI_ZNS) { ret = nvme_update_zone_info(ns, lbaf); if (ret) goto out_unfreeze; } + ret = nvme_configure_metadata(ns, id); + if (ret) + goto out_unfreeze; + nvme_set_chunk_sectors(ns, id); + nvme_update_disk_info(ns->disk, ns, id); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b705988629f2..6c8eab8de288 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -695,7 +695,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, rq->end_io_data = rqd; - blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io); + blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); return 0; @@ -816,10 +816,10 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); } - bio_set_dev(bio, ns->disk->part0); + bio->bi_bdev = ns->disk->part0; } - blk_execute_rq(NULL, rq, 0); + blk_execute_rq(q, NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) ret = -EINTR; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1427c9555cef..7f63a51a53df 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); if (likely(ns)) { - bio_set_dev(bio, ns->disk->part0); + bio->bi_bdev = ns->disk->part0; bio->bi_opf |= REQ_NVME_MPATH; trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); @@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work) * Reset disk to the mpath node and resubmit to select a new * path. 
*/ - bio_set_dev(bio, head->disk->part0); + bio->bi_bdev = head->disk->part0; submit_bio_noacct(bio); } } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 808ab5186928..6bad4d4dcdf0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1357,7 +1357,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } abort_req->end_io_data = NULL; - blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio); + blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); /* * The aborted req will be completed on receiving the abort req. @@ -2281,7 +2281,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); - blk_execute_rq_nowait(NULL, req, false, + blk_execute_rq_nowait(q, NULL, req, false, opcode == nvme_admin_delete_cq ? nvme_del_cq_end : nvme_del_queue_end); return 0; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index c7e3ec561ba0..1dfe9a3500e3 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -9,7 +9,13 @@ int nvme_revalidate_zones(struct nvme_ns *ns) { - return blk_revalidate_disk_zones(ns->disk, NULL); + struct request_queue *q = ns->queue; + int ret; + + ret = blk_revalidate_disk_zones(ns->disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); + return ret; } static int nvme_set_max_append(struct nvme_ctrl *ctrl) @@ -103,11 +109,10 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) goto free_data; } - blk_queue_set_zoned(ns->disk, BLK_ZONED_HM); + q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); free_data: kfree(id); return status; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 
bf6e0ac9ad28..125dde3f410e 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -333,7 +333,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req) { - if (blkdev_issue_flush(req->ns->bdev)) + if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL)) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; } diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index cbc88acdd233..b9776fc8f08f 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -275,7 +275,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) schedule_work(&req->p.work); } else { rq->end_io_data = req; - blk_execute_rq_nowait(ns ? ns->disk : NULL, rq, 0, + blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0, nvmet_passthru_req_done); } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 28c04a4efa66..c7eb9a10c680 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -428,15 +428,23 @@ static int dasd_state_unfmt_to_basic(struct dasd_device *device) static int dasd_state_ready_to_online(struct dasd_device * device) { + struct gendisk *disk; + struct disk_part_iter piter; + struct block_device *part; + device->state = DASD_STATE_ONLINE; if (device->block) { dasd_schedule_block_bh(device->block); if ((device->features & DASD_FEATURE_USERAW)) { - kobject_uevent(&disk_to_dev(device->block->gdp)->kobj, - KOBJ_CHANGE); + disk = device->block->gdp; + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); return 0; } - disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE); + disk = device->block->bdev->bd_disk; + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); + disk_part_iter_exit(&piter); } return 0; } @@ -447,6 +455,9 @@ dasd_state_ready_to_online(struct dasd_device * device) static int 
dasd_state_online_to_ready(struct dasd_device *device) { int rc; + struct gendisk *disk; + struct disk_part_iter piter; + struct block_device *part; if (device->discipline->online_to_ready) { rc = device->discipline->online_to_ready(device); @@ -455,8 +466,13 @@ static int dasd_state_online_to_ready(struct dasd_device *device) } device->state = DASD_STATE_READY; - if (device->block && !(device->features & DASD_FEATURE_USERAW)) - disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE); + if (device->block && !(device->features & DASD_FEATURE_USERAW)) { + disk = device->block->bdev->bd_disk; + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); + disk_part_iter_exit(&piter); + } return 0; } diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index c00f06e9ecb0..f11f51e2465f 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2007,7 +2007,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->timeout = 10 * HZ; rq->retries = 5; - blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done); + blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done); } /** diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 4d2280658559..b3f14f05340a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -269,7 +269,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, /* * head injection *required* here otherwise quiesce won't work */ - blk_execute_rq(NULL, req, 1); + blk_execute_rq(req->q, NULL, req, 1); /* * Some devices (USB mass-storage in particular) may transfer diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 03adb39293c2..cf07b7f93579 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -665,28 +665,12 @@ static int sd_zbc_init_disk(struct scsi_disk *sdkp) return 0; } -static void sd_zbc_clear_zone_info(struct scsi_disk *sdkp) +void 
sd_zbc_release_disk(struct scsi_disk *sdkp) { - /* Serialize against revalidate zones */ - mutex_lock(&sdkp->rev_mutex); - kvfree(sdkp->zones_wp_offset); sdkp->zones_wp_offset = NULL; kfree(sdkp->zone_wp_update_buf); sdkp->zone_wp_update_buf = NULL; - - sdkp->nr_zones = 0; - sdkp->rev_nr_zones = 0; - sdkp->zone_blocks = 0; - sdkp->rev_zone_blocks = 0; - - mutex_unlock(&sdkp->rev_mutex); -} - -void sd_zbc_release_disk(struct scsi_disk *sdkp) -{ - if (sd_is_zoned(sdkp)) - sd_zbc_clear_zone_info(sdkp); } static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) @@ -785,21 +769,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) */ return 0; - /* READ16/WRITE16 is mandatory for ZBC disks */ - sdkp->device->use_16_for_rw = 1; - sdkp->device->use_10_for_rw = 0; - - if (!blk_queue_is_zoned(q)) { - /* - * This can happen for a host aware disk with partitions. - * The block device zone information was already cleared - * by blk_queue_set_zoned(). Only clear the scsi disk zone - * information and exit early. - */ - sd_zbc_clear_zone_info(sdkp); - return 0; - } - /* Check zoned block device characteristics (unconstrained reads) */ ret = sd_zbc_check_zoned_characteristics(sdkp, buf); if (ret) @@ -820,13 +789,9 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) blk_queue_max_active_zones(q, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); - /* - * Per ZBC and ZAC specifications, writes in sequential write required - * zones of host-managed devices must be aligned to the device physical - * block size. 
- */ - if (blk_queue_zoned_model(q) == BLK_ZONED_HM) - blk_queue_zone_write_granularity(q, sdkp->physical_block_size); + /* READ16/WRITE16 is mandatory for ZBC disks */ + sdkp->device->use_16_for_rw = 1; + sdkp->device->use_10_for_rw = 0; sdkp->rev_nr_zones = nr_zones; sdkp->rev_zone_blocks = zone_blocks; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 4383d93110f8..bfa8d77322d7 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -829,7 +829,8 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ - blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk, + srp->rq, at_head, sg_rq_end_io); return 0; } diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 841ad2fc369a..43f7624508a9 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -585,7 +585,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, rq->retries = retries; req->end_io_data = SRpnt; - blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end); + blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end); return 0; } diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index cce455929778..b0cb5b95e892 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -241,7 +241,6 @@ struct target_core_file_cmd { unsigned long len; struct se_cmd *cmd; struct kiocb iocb; - struct bio_vec bvecs[]; }; static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) @@ -269,22 +268,29 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct target_core_file_cmd *aio_cmd; struct iov_iter iter = {}; struct scatterlist *sg; + struct bio_vec *bvec; ssize_t len = 0; int ret = 0, i; - aio_cmd = kmalloc(struct_size(aio_cmd, bvecs, sgl_nents), GFP_KERNEL); + aio_cmd = kmalloc(sizeof(struct target_core_file_cmd), 
GFP_KERNEL); if (!aio_cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL); + if (!bvec) { + kfree(aio_cmd); + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + } + for_each_sg(sgl, sg, sgl_nents, i) { - aio_cmd->bvecs[i].bv_page = sg_page(sg); - aio_cmd->bvecs[i].bv_len = sg->length; - aio_cmd->bvecs[i].bv_offset = sg->offset; + bvec[i].bv_page = sg_page(sg); + bvec[i].bv_len = sg->length; + bvec[i].bv_offset = sg->offset; len += sg->length; } - iov_iter_bvec(&iter, is_write, aio_cmd->bvecs, sgl_nents, len); + iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); aio_cmd->cmd = cmd; aio_cmd->len = len; @@ -301,6 +307,8 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, else ret = call_read_iter(file, &aio_cmd->iocb, &iter); + kfree(bvec); + if (ret != -EIOCBQUEUED) cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 33770e5808ce..7994f27e4527 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1000,7 +1000,8 @@ pscsi_execute_cmd(struct se_cmd *cmd) req->timeout = PS_TIMEOUT_OTHER; scsi_req(req)->retries = PS_RETRY; - blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG), + blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req, + (cmd->sam_task_attr == TCM_HEAD_TAG), pscsi_req_done); return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index ec26179c8062..235b5042672e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -126,6 +126,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, bd_abort_claiming(bdev, truncate_bdev_range); return 0; } +EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { @@ -423,7 +424,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; - nr_pages = 
bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES); + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { bool polled = false; @@ -488,10 +489,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; - if (!iov_iter_count(iter)) + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); + if (!nr_pages) return 0; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1); if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); @@ -688,7 +688,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - error = blkdev_issue_flush(bdev); + error = blkdev_issue_flush(bdev, GFP_KERNEL); if (error == -EOPNOTSUPP) error = 0; @@ -1808,11 +1808,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return error; /* - * Invalidate the page cache again; if someone wandered in and dirtied - * a page, we just discard it - userspace has no way of knowing whether - * the write happened before or after discard completing... + * Invalidate again; if someone wandered in and dirtied a page, + * the caller will be given -EBUSY. The third argument is + * inclusive, so the rounding here is safe. 
*/ - return truncate_bdev_range(bdev, file->f_mode, start, end); + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); } const struct file_operations def_blk_fops = { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bc3b33efddc5..b8fab44394f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -421,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) * Preallocate a bio that's always going to be used for flushing device * barriers and matches the device lifespan */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); + dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); if (!dev->flush_bio) { kfree(dev); return ERR_PTR(-ENOMEM); diff --git a/fs/direct-io.c b/fs/direct-io.c index 00c419e49f0e..4134e2e19090 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -431,8 +431,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 183ffdf4d43c..a92478eabfa4 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -361,7 +361,7 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } const struct file_operations exfat_file_operations = { diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6e8208acfc62..0a14a7c87bf8 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1076,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * flush before we start writing fast commit blocks. 
*/ if (journal->j_fs_dev != journal->j_dev) - blkdev_issue_flush(journal->j_fs_dev); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { @@ -1535,7 +1535,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) out: iput(inode); if (!ret) - blkdev_issue_flush(sb->s_bdev); + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); return 0; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 027a7d7037a0..113bfb023a4a 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -174,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = ext4_fsync_journal(inode, datasync, &needs_barrier); if (needs_barrier) { - err = blkdev_issue_flush(inode->i_sb->s_bdev); + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index faa912862591..14aef148a968 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1586,7 +1586,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (ret < 0) goto err_out; if (barrier) - blkdev_issue_flush(sb->s_bdev); + blkdev_issue_flush(sb->s_bdev, GFP_NOFS); skip_zeroout: ext4_lock_group(sb, group); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 06fe22fe7b51..81c47b75ff99 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5703,7 +5703,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) needs_barrier = true; if (needs_barrier) { int err; - err = blkdev_issue_flush(sb->s_bdev); + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index eba1546b5ea1..ffcddabfd11d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -50,6 +50,27 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } +static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, + unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); +} + +struct bio 
*f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) +{ + if (noio) { + /* No failure on bio allocation */ + return __f2fs_bio_alloc(GFP_NOIO, npages); + } + + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); + return NULL; + } + + return __f2fs_bio_alloc(GFP_KERNEL, npages); +} + static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -383,7 +404,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); + bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, fio->new_blkaddr, bio); if (is_read_io(fio->op)) { @@ -973,9 +994,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES), - &f2fs_bioset); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), + for_write); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 52b7c7d0a351..f0aec024e5df 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -43,6 +43,7 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, + FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -3481,6 +3482,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 993004f06a77..440634dfaa56 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -563,7 +563,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) static 
int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - int ret = blkdev_issue_flush(bdev); + struct bio *bio; + int ret; + + bio = f2fs_bio_alloc(sbi, 0, false); + if (!bio) + return -ENOMEM; + + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; + bio_set_dev(bio, bdev); + ret = submit_bio_wait(bio); + bio_put(bio); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4acfa7d36731..30d5abef4361 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -46,6 +46,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", diff --git a/fs/fat/file.c b/fs/fat/file.c index 5fee74f1ad61..f9ee27cf4d7c 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index ca464328b79c..e3da9e96b835 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(inode->i_sb->s_bdev); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b9e3db3f855f..807119ae5adf 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -239,7 +239,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) mutex_unlock(&sbi->vh_mutex); if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(sb->s_bdev); + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); 
return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4e339bba6afb..25675c1fa44d 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -283,8 +283,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, orig_count = iov_iter_count(dio->submit.iter); iov_iter_truncate(dio->submit.iter, length); - if (!iov_iter_count(dio->submit.iter)) + nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); + if (nr_pages <= 0) { + ret = nr_pages; goto out; + } if (need_zeroout) { /* zero out from the start of the block to the write offset */ @@ -300,7 +303,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, */ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES); do { size_t n; if (dio->error) { @@ -343,8 +345,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, dio->size += n; copied += n; - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, - BIO_MAX_PAGES); + nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); iomap_dio_submit_bio(dio, iomap, bio, pos); pos += n; } while (nr_pages); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 63b526d44886..472932b9e6bc 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -416,7 +416,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. 
*/ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3cc4ab2ba7f4..b121d7d434c6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -825,7 +825,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); /* Done it all: now write the commit record asynchronously. */ if (jbd2_has_feature_async_commit(journal)) { @@ -932,7 +932,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev); + blkdev_issue_flush(journal->j_dev, GFP_NOFS); } if (err) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 69f18fe20923..dc0694fcfcd1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -326,7 +326,7 @@ int jbd2_journal_recover(journal_t *journal) err = err2; /* Make sure all replayed data is on permanent storage */ if (journal->j_flags & JBD2_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev); + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL); if (!err) err = err2; } diff --git a/fs/libfs.c b/fs/libfs.c index abf7674fb437..79721571e014 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1117,7 +1117,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = __generic_file_fsync(file, start, end, datasync); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } EXPORT_SYMBOL(generic_file_fsync); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 
1a96ce28efb0..3be6836074ae 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -123,6 +123,11 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, npg = min(npg, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, npg); + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (npg /= 2)) + bio = bio_alloc(GFP_NOIO, npg); + } + if (bio) { bio->bi_iter.bi_sector = disk_sector; bio_set_dev(bio, bdev); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 1058659a8d31..a07c39c94bbd 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -254,7 +254,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, req->cmd[4] = bufflen & 0xff; req->cmd_len = COMMAND_SIZE(INQUIRY); - blk_execute_rq(NULL, rq, 1); + blk_execute_rq(rq->q, NULL, rq, 1); if (req->result) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", req->result); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1e75417bfe6e..1a8729eded8b 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -386,6 +386,10 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, struct bio *bio; bio = bio_alloc(GFP_NOIO, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOIO, nr_vecs); + } if (likely(bio)) { bio_set_dev(bio, nilfs->ns_bdev); bio->bi_iter.bi_sector = diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 987c8ab02aee..b55cdeb4d169 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs) */ smp_wmb(); - err = blkdev_issue_flush(nilfs->ns_bdev); + err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL); if (err != -EIO) err = 0; return err; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index df6d709d2ae3..85979e2214b3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t 
end, needs_barrier = true; err = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - ret = blkdev_issue_flush(inode->i_sb->s_bdev); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!err) err = ret; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1db0254bc38b..0b641ae694f1 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); if (barrier_done < 0) return barrier_done; diff --git a/fs/splice.c b/fs/splice.c index 5dbce4dcc1a7..b06846f1e6ee 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -662,14 +662,12 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* build the vector */ left = sd.total_len; - for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { + for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; - /* zero-length bvecs are not supported, skip them */ - if (!this_len) - continue; - this_len = min(this_len, left); + if (this_len > left) + this_len = left; ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { @@ -682,7 +680,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, array[n].bv_len = this_len; array[n].bv_offset = buf->offset; left -= this_len; - n++; } iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 586d42342a79..21b1d034aca3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -343,7 +343,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev); + 
blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); } STATIC void diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a29653c0196b..0e7ab0bc00ae 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -541,7 +541,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ret = file_write_and_wait_range(file, start, end); if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (ret) zonefs_io_error(inode, true); @@ -678,7 +678,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) if (!nr_pages) return 0; - bio = bio_alloc(GFP_NOFS, nr_pages); + bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set); if (!bio) return -ENOMEM; @@ -1581,11 +1581,12 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; /* - * The block size is set to the device zone write granularity to ensure - * that write operations are always aligned according to the device - * interface constraints. + * The block size is set to the device physical sector size to ensure + * that write operations on 512e devices (512B logical block and 4KB + * physical block) are always aligned to the device physical blocks, + * as mandated by the ZBC/ZAC specifications. 
*/ - sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev)); + sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev)); sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev)); sbi->s_uid = GLOBAL_ROOT_UID; sbi->s_gid = GLOBAL_ROOT_GID; diff --git a/include/linux/bio.h b/include/linux/bio.h index 5b468f2242ff..77c7d3fd00e7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -10,7 +10,6 @@ #include /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include -#include #define BIO_DEBUG @@ -329,6 +328,7 @@ struct bio_integrity_payload { struct bvec_iter bip_iter; + unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ @@ -406,9 +406,7 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); -struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, - struct bio_set *bs); -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); +extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -416,11 +414,16 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) +static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } +static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); +} + extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); @@ -438,18 +441,6 @@ static inline void bio_wouldblock_error(struct bio 
*bio) bio_endio(bio); } -/* - * Calculate number of bvec segments that should be allocated to fit data - * pointed by @iter. If @iter is backed by bvec it's going to be reused - * instead of allocating a new one. - */ -static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) -{ - if (iov_iter_is_bvec(iter)) - return 0; - return iov_iter_npages(iter, max_segs); -} - struct request_queue; extern int submit_bio_wait(struct bio *bio); @@ -489,11 +480,13 @@ static inline void zero_fill_bio(struct bio *bio) zero_fill_bio_iter(bio, bio->bi_iter); } +extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); +extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); +extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ do { \ - bio_clear_flag(bio, BIO_REMAPPED); \ if ((bio)->bi_bdev != (bdev)) \ bio_clear_flag(bio, BIO_THROTTLED); \ (bio)->bi_bdev = (bdev); \ @@ -502,7 +495,6 @@ do { \ #define bio_copy_dev(dst, src) \ do { \ - bio_clear_flag(dst, BIO_REMAPPED); \ (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) @@ -711,7 +703,6 @@ struct bio_set { mempool_t bvec_integrity_pool; #endif - unsigned int back_pad; /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details @@ -722,6 +713,12 @@ struct bio_set { struct workqueue_struct *rescue_workqueue; }; +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index aabbf6830ffc..6b410dab48ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,6 +140,10 @@ struct blk_mq_hw_ctx { * shared across request queues. */ atomic_t nr_active; + /** + * @elevator_queued: Number of queued requests on hctx. 
+ */ + atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 9ec28fcd3bcc..4667720ca095 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -227,7 +227,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* BIO_* below */ + unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; @@ -306,10 +306,36 @@ enum { * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ - BIO_REMAPPED, BIO_FLAG_LAST }; +/* See BVEC_POOL_OFFSET below before adding new flags */ + +/* + * We support 6 different bvec pools, the last one is magic in that it + * is backed by a mempool. + */ +#define BVEC_POOL_NR 6 +#define BVEC_POOL_MAX (BVEC_POOL_NR - 1) + +/* + * Top 3 bits of bio flags indicate the pool the bvecs came from. We add + * 1 to the actual index so that 0 indicates that there are no bvecs to be + * freed. 
+ */ +#define BVEC_POOL_BITS (3) +#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) +#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) +#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) +# error "BVEC_POOL_BITS is too small" +#endif + +/* + * Flags starting here get preserved by bio_reset() - this includes + * only BVEC_POOL_IDX() + */ +#define BIO_RESET_BITS BVEC_POOL_OFFSET + typedef __u32 __bitwise blk_mq_req_flags_t; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9149f4a5adb3..b55bd534b2e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -337,7 +337,6 @@ struct queue_limits { unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; - unsigned int zone_write_granularity; unsigned short max_segments; unsigned short max_integrity_segments; @@ -949,8 +948,9 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct gendisk *, struct request *, int); -extern void blk_execute_rq_nowait(struct gendisk *, +extern void blk_execute_rq(struct request_queue *, struct gendisk *, + struct request *, int); +extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); /* Helper to convert REQ_OP_XXX to its string format XXX */ @@ -1161,8 +1161,6 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); extern void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); void blk_queue_update_readahead(struct request_queue *q); @@ 
-1291,7 +1289,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) !list_empty(&plug->cb_list)); } -int blkdev_issue_flush(struct block_device *bdev); +int blkdev_issue_flush(struct block_device *, gfp_t); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { @@ -1319,7 +1317,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } -static inline int blkdev_issue_flush(struct block_device *bdev) +static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) { return 0; } @@ -1476,18 +1474,6 @@ static inline int bdev_io_opt(struct block_device *bdev) return queue_io_opt(bdev_get_queue(bdev)); } -static inline unsigned int -queue_zone_write_granularity(const struct request_queue *q) -{ - return q->limits.zone_write_granularity; -} - -static inline unsigned int -bdev_zone_write_granularity(struct block_device *bdev) -{ - return queue_zone_write_granularity(bdev_get_queue(bdev)); -} - static inline int queue_alignment_offset(const struct request_queue *q) { if (q->limits.misaligned) @@ -1968,9 +1954,22 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -unsigned long bio_start_io_acct(struct bio *bio); -void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, - struct block_device *orig_bdev); +unsigned long part_start_io_acct(struct gendisk *disk, + struct block_device **part, struct bio *bio); +void part_end_io_acct(struct block_device *part, struct bio *bio, + unsigned long start_time); + +/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * + * Returns the start time that should be passed back to bio_end_io_acct(). 
+ */ +static inline unsigned long bio_start_io_acct(struct bio *bio) +{ + return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio), + bio_op(bio)); +} /** * bio_end_io_acct - end I/O accounting for bio based drivers @@ -1979,7 +1978,7 @@ void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { - return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); + return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time); } int bdev_read_only(struct block_device *bdev); @@ -2014,16 +2013,21 @@ void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, - loff_t lend); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); int sync_blockdev(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { } +static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + return 0; +} static inline int sync_blockdev(struct block_device *bdev) { return 0; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1fe8e105b83b..bacc40a0bdf3 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -172,8 +172,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); /* Supports zoned block devices sequential write constraint */ #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) -/* Supports scheduling on multiple hardware queues */ -#define ELEVATOR_F_MQ_AWARE (1U << 1) #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f364619092cc..a62ccbfac54b 100644 --- 
a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -32,7 +32,6 @@ extern struct class block_class; #include #include #include -#include #define PARTITION_META_INFO_VOLNAMELTH 64 /* @@ -117,6 +116,13 @@ enum { DISK_EVENT_FLAG_UEVENT = 1 << 1, }; +struct disk_part_tbl { + struct rcu_head rcu_head; + int len; + struct block_device __rcu *last_lookup; + struct block_device __rcu *part[]; +}; + struct disk_events; struct badblocks; @@ -142,7 +148,12 @@ struct gendisk { unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ - struct xarray part_tbl; + /* Array of pointers to partitions indexed by partno. + * Protected with matching bdev lock but stat and other + * non-critical accesses use RCU. Always access through + * helpers. + */ + struct disk_part_tbl __rcu *part_tbl; struct block_device *part0; const struct block_device_operations *fops; @@ -202,11 +213,10 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } -void disk_uevent(struct gendisk *disk, enum kobject_action action); - /* * Smarter partition iterator without context limits. 
*/ +#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ #define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ #define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ #define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */ @@ -214,7 +224,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action); struct disk_part_iter { struct gendisk *disk; struct block_device *part; - unsigned long idx; + int idx; unsigned int flags; }; @@ -222,6 +232,7 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags); struct block_device *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); +extern bool disk_has_partitions(struct gendisk *disk); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, diff --git a/include/linux/swap.h b/include/linux/swap.h index 3f1f7ae0fbe9..596bc2f4d9b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -468,6 +468,7 @@ extern int free_swap_and_cache(swp_entry_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); +extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int __swap_count(swp_entry_t entry); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index d8ca336ff9cd..f0b2ccb1bb01 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -72,6 +72,8 @@ __start.bi_bvec_done = skip; \ __start.bi_idx = 0; \ for_each_bvec(__v, i->bvec, __bi, __start) { \ + if (!__v.bv_len) \ + continue; \ (void)(STEP); \ } \ } @@ -1067,21 +1069,6 @@ static void pipe_advance(struct iov_iter *i, size_t size) pipe_truncate(i); } -static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) -{ - struct bvec_iter bi; - - bi.bi_size = i->count; - 
bi.bi_bvec_done = i->iov_offset; - bi.bi_idx = 0; - bvec_iter_advance(i->bvec, &bi, size); - - i->bvec += bi.bi_idx; - i->nr_segs -= bi.bi_idx; - i->count = bi.bi_size; - i->iov_offset = bi.bi_bvec_done; -} - void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(iov_iter_is_pipe(i))) { @@ -1092,10 +1079,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size) i->count -= size; return; } - if (iov_iter_is_bvec(i)) { - iov_iter_bvec_advance(i, size); - return; - } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); diff --git a/mm/page_io.c b/mm/page_io.c index 92f7941c6d01..a75f35464a4e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,6 +26,25 @@ #include #include +static struct bio *get_swap_bio(gfp_t gfp_flags, + struct page *page, bio_end_io_t end_io) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, 1); + if (bio) { + struct block_device *bdev; + + bio->bi_iter.bi_sector = map_swap_page(page, &bdev); + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; + bio->bi_end_io = end_io; + + bio_add_page(bio, page, thp_size(page), 0); + } + return bio; +} + void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -342,13 +361,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = bio_alloc(GFP_NOIO, 1); - bio_set_dev(bio, sis->bdev); - bio->bi_iter.bi_sector = swap_page_sector(page); + bio = get_swap_bio(GFP_NOIO, page, end_write_func); + if (bio == NULL) { + set_page_dirty(page); + unlock_page(page); + return -ENOMEM; + } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); - bio->bi_end_io = end_write_func; - bio_add_page(bio, page, thp_size(page), 0); - bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); @@ -408,18 +427,18 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, sis->bdev); - bio->bi_opf = 
REQ_OP_READ; - bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_end_io = end_swap_bio_read; - bio_add_page(bio, page, thp_size(page), 0); - + bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); + if (bio == NULL) { + unlock_page(page); + ret = -ENOMEM; + goto out; + } disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); if (synchronous) { bio->bi_opf |= REQ_HIPRI; get_task_struct(current); diff --git a/mm/swapfile.c b/mm/swapfile.c index 21a98cb8d646..9fffc5af29d1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,6 +47,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); +static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -1849,13 +1850,12 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { + struct block_device *bdev; struct swap_info_struct *si = swap_type_to_swap_info(type); - struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; - se = offset_to_swap_extent(si, offset); - return se->start_block + (offset - se->start_page); + return map_swap_entry(swp_entry(type, offset), &bdev); } /* @@ -2281,6 +2281,36 @@ static void drain_mmlist(void) spin_unlock(&mmlist_lock); } +/* + * Use this swapdev's extent info to locate the (PAGE_SIZE) block which + * corresponds to page offset for the specified swap entry. + * Note that the type of this function is sector_t, but it returns page offset + * into the bdev, not sector offset. 
+ */ +static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) +{ + struct swap_info_struct *sis; + struct swap_extent *se; + pgoff_t offset; + + sis = swp_swap_info(entry); + *bdev = sis->bdev; + + offset = swp_offset(entry); + se = offset_to_swap_extent(sis, offset); + return se->start_block + (offset - se->start_page); +} + +/* + * Returns the page offset into bdev for the specified page's swap entry. + */ +sector_t map_swap_page(struct page *page, struct block_device **bdev) +{ + swp_entry_t entry; + entry.val = page_private(page); + return map_swap_entry(entry, bdev); +} + /* * Free all of a swapdev's extent information */