From 47c1260ea822d44117cb091ded290719c1ea05e1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@google.com>
Date: Wed, 3 Mar 2021 10:37:27 +0100
Subject: [PATCH] ANDROID: Revert "Merge 582cd91f69de ("Merge tag
 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block") into
 android-mainline"

This reverts commit f858d3a02f88 ("Merge 582cd91f69de ("Merge tag
'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block") into
android-mainline") as it is causing boot problems with md devices in the
android build systems.

Reverting this should allow us to continue on with the merges of other
non-block issues, in order to be able to focus on tracking this issue
down at a later time.

Fixes: f858d3a02f88 ("Merge 582cd91f69de ("Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block") into android-mainline")
Bug: 181742070
Cc: Eric Biggers <ebiggers@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I03cc2f86827f2d18db88fc68a4128747430f40d4
---
 Documentation/block/biovecs.rst       |   2 -
 Documentation/block/queue-sysfs.rst   |  13 -
 Documentation/filesystems/f2fs.rst    |   1 +
 Documentation/filesystems/porting.rst |  16 -
 block/bfq-iosched.c                   | 445 ++++++++-------------
 block/bfq-iosched.h                   |  29 +-
 block/bfq-wf2q.c                      |   3 +
 block/bio-integrity.c                 |  17 +-
 block/bio.c                           | 554 ++++++++++++++------------
 block/blk-cgroup.c                    |  15 +-
 block/blk-core.c                      |  43 +-
 block/blk-crypto-fallback.c           |   4 +-
 block/blk-exec.c                      |  14 +-
 block/blk-flush.c                     |  17 +-
 block/blk-mq.c                        |  67 +---
 block/blk-settings.c                  |  41 +-
 block/blk-sysfs.c                     |   8 -
 block/blk-wbt.c                       |   4 +-
 block/blk-zoned.c                     |  17 -
 block/blk.h                           |  10 +-
 block/bounce.c                        |   2 -
 block/bsg.c                           |   6 +-
 block/genhd.c                         | 275 +++++++++++--
 block/kyber-iosched.c                 |   1 -
 block/mq-deadline.c                   |   6 +
 block/partitions/core.c               |  33 +-
 block/scsi_ioctl.c                    |   6 +-
 drivers/block/drbd/drbd_actlog.c      |   2 +-
 drivers/block/drbd/drbd_bitmap.c      |   2 +-
 drivers/block/drbd/drbd_int.h         |   2 +
 drivers/block/drbd/drbd_main.c        |  13 +
 drivers/block/drbd/drbd_req.c         |   5 +-
 drivers/block/drbd/drbd_req.h         |  12 +
 drivers/block/drbd/drbd_worker.c      |   5 +-
 drivers/block/mtip32xx/mtip32xx.c     |   2 +-
 drivers/block/null_blk/zoned.c        |   8 +-
 drivers/block/paride/pd.c             |   2 +-
 drivers/block/pktcdvd.c               |   2 +-
 drivers/block/sx8.c                   |   4 +-
 drivers/block/virtio_blk.c            |   2 +-
 drivers/cdrom/cdrom.c                 |   2 +-
 drivers/ide/ide-atapi.c               |   2 +-
 drivers/ide/ide-cd.c                  |   2 +-
 drivers/ide/ide-cd_ioctl.c            |   2 +-
 drivers/ide/ide-devsets.c             |   2 +-
 drivers/ide/ide-disk.c                |   2 +-
 drivers/ide/ide-ioctls.c              |   4 +-
 drivers/ide/ide-park.c                |   2 +-
 drivers/ide/ide-pm.c                  |   4 +-
 drivers/ide/ide-tape.c                |   2 +-
 drivers/ide/ide-taskfile.c            |   2 +-
 drivers/md/bcache/debug.c             |   2 +-
 drivers/md/bcache/request.c           |  34 +-
 drivers/md/bcache/super.c             |   2 +-
 drivers/md/dm-clone-target.c          |  14 +-
 drivers/md/dm-zoned-metadata.c        |   6 +-
 drivers/md/md.c                       |  71 ++--
 drivers/md/md.h                       |   2 +
 drivers/md/raid1.c                    |   2 +-
 drivers/md/raid10.c                   |   6 +-
 drivers/md/raid5-ppl.c                |   2 +-
 drivers/md/raid5.c                    | 108 ++---
 drivers/mmc/core/block.c              |  10 +-
 drivers/nvme/host/core.c              |  22 +-
 drivers/nvme/host/lightnvm.c          |   6 +-
 drivers/nvme/host/multipath.c         |   4 +-
 drivers/nvme/host/pci.c               |   4 +-
 drivers/nvme/host/zns.c               |  11 +-
 drivers/nvme/target/io-cmd-bdev.c     |   2 +-
 drivers/nvme/target/passthru.c        |   2 +-
 drivers/s390/block/dasd.c             |  26 +-
 drivers/scsi/scsi_error.c             |   2 +-
 drivers/scsi/scsi_lib.c               |   2 +-
 drivers/scsi/sd_zbc.c                 |  43 +-
 drivers/scsi/sg.c                     |   3 +-
 drivers/scsi/st.c                     |   2 +-
 drivers/target/target_core_file.c     |  20 +-
 drivers/target/target_core_pscsi.c    |   3 +-
 fs/block_dev.c                        |  20 +-
 fs/btrfs/volumes.c                    |   2 +-
 fs/direct-io.c                        |   2 -
 fs/exfat/file.c                       |   2 +-
 fs/ext4/fast_commit.c                 |   4 +-
 fs/ext4/fsync.c                       |   2 +-
 fs/ext4/ialloc.c                      |   2 +-
 fs/ext4/super.c                       |   2 +-
 fs/f2fs/data.c                        |  28 +-
 fs/f2fs/f2fs.h                        |   2 +
 fs/f2fs/segment.c                     |  12 +-
 fs/f2fs/super.c                       |   1 +
 fs/fat/file.c                         |   2 +-
 fs/hfsplus/inode.c                    |   2 +-
 fs/hfsplus/super.c                    |   2 +-
 fs/iomap/direct-io.c                  |   9 +-
 fs/jbd2/checkpoint.c                  |   2 +-
 fs/jbd2/commit.c                      |   4 +-
 fs/jbd2/recovery.c                    |   2 +-
 fs/libfs.c                            |   2 +-
 fs/nfs/blocklayout/blocklayout.c      |   5 +
 fs/nfsd/blocklayout.c                 |   2 +-
 fs/nilfs2/segbuf.c                    |   4 +
 fs/nilfs2/the_nilfs.h                 |   2 +-
 fs/ocfs2/file.c                       |   2 +-
 fs/reiserfs/file.c                    |   2 +-
 fs/splice.c                           |   9 +-
 fs/xfs/xfs_super.c                    |   2 +-
 fs/zonefs/super.c                     |  13 +-
 include/linux/bio.h                   |  37 +-
 include/linux/blk-mq.h                |   4 +
 include/linux/blk_types.h             |  30 +-
 include/linux/blkdev.h                |  54 +--
 include/linux/elevator.h              |   2 -
 include/linux/genhd.h                 |  21 +-
 include/linux/swap.h                  |   1 +
 lib/iov_iter.c                        |  21 +-
 mm/page_io.c                          |  45 ++-
 mm/swapfile.c                         |  36 +-
 117 files changed, 1370 insertions(+), 1173 deletions(-)

diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst
index ddb867e0185b..36771a131b56 100644
--- a/Documentation/block/biovecs.rst
+++ b/Documentation/block/biovecs.rst
@@ -40,8 +40,6 @@ normal code doesn't have to deal with bi_bvec_done.
    There is a lower level advance function - bvec_iter_advance() - which takes
    a pointer to a biovec, not a bio; this is used by the bio integrity code.
 
-As of 5.12 bvec segments with zero bv_len are not supported.
-
 What's all this get us?
 =======================
 
diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
index 4dc7f0d499a8..2638d3446b79 100644
--- a/Documentation/block/queue-sysfs.rst
+++ b/Documentation/block/queue-sysfs.rst
@@ -261,12 +261,6 @@ For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of
 bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES
 is not supported.
 
-zone_append_max_bytes (RO)
---------------------------
-This is the maximum number of bytes that can be written to a sequential
-zone of a zoned block device using a zone append write operation
-(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices.
-
 zoned (RO)
 ----------
 This indicates if the device is a zoned block device and the zone model of the
@@ -279,11 +273,4 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC
 do not support zone commands, they will be treated as regular block devices
 and zoned will report "none".
 
-zone_write_granularity (RO)
----------------------------
-This indicates the alignment constraint, in bytes, for write operations in
-sequential zones of zoned block devices (devices with a zoned attributed
-that reports "host-managed" or "host-aware"). This value is always 0 for
-regular block devices.
-
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 35ed01a5fbc9..81c05baa8312 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -179,6 +179,7 @@ fault_type=%d		 Support configuring fault injection type, should be
 			 FAULT_KVMALLOC		  0x000000002
 			 FAULT_PAGE_ALLOC	  0x000000004
 			 FAULT_PAGE_GET		  0x000000008
+			 FAULT_ALLOC_BIO	  0x000000010
 			 FAULT_ALLOC_NID	  0x000000020
 			 FAULT_ORPHAN		  0x000000040
 			 FAULT_BLOCK		  0x000000080
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 1f8cf8e10b34..867036aa90b8 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -865,19 +865,3 @@ no matter what.  Everything is handled by the caller.
 
 clone_private_mount() returns a longterm mount now, so the proper destructor of
 its result is kern_unmount() or kern_unmount_array().
-
----
-
-**mandatory**
-
-zero-length bvec segments are disallowed, they must be filtered out before
-passed on to an iterator.
-
----
-
-**mandatory**
-
-For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but
-uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and
-page references stay until I/O has completed, i.e. until ->ki_complete() has
-been called or returned with non -EIOCBQUEUED code.
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index b398dde53af9..9e81d1052091 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -158,6 +158,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS						\
 
 /* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1023,16 +1024,9 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 	else
 		bfq_clear_bfqq_IO_bound(bfqq);
 
-	bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns;
-	bfqq->inject_limit = bic->saved_inject_limit;
-	bfqq->decrease_time_jif = bic->saved_decrease_time_jif;
-
 	bfqq->entity.new_weight = bic->saved_weight;
 	bfqq->ttime = bic->saved_ttime;
-	bfqq->io_start_time = bic->saved_io_start_time;
-	bfqq->tot_idle_time = bic->saved_tot_idle_time;
 	bfqq->wr_coeff = bic->saved_wr_coeff;
-	bfqq->service_from_wr = bic->saved_service_from_wr;
 	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
 	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
 	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
@@ -1653,8 +1647,6 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
 	return bfqq_weight > in_serv_weight;
 }
 
-static bool bfq_better_to_idle(struct bfq_queue *bfqq);
-
 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 					     struct bfq_queue *bfqq,
 					     int old_wr_coeff,
@@ -1679,19 +1671,15 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	 * - it is sync,
 	 * - it does not belong to a large burst,
 	 * - it has been idle for enough time or is soft real-time,
-	 * - is linked to a bfq_io_cq (it is not shared in any sense),
-	 * - has a default weight (otherwise we assume the user wanted
-	 *   to control its weight explicitly)
+	 * - is linked to a bfq_io_cq (it is not shared in any sense).
 	 */
 	in_burst = bfq_bfqq_in_large_burst(bfqq);
 	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
 		!BFQQ_TOTALLY_SEEKY(bfqq) &&
 		!in_burst &&
 		time_is_before_jiffies(bfqq->soft_rt_next_start) &&
-		bfqq->dispatched == 0 &&
-		bfqq->entity.new_weight == 40;
-	*interactive = !in_burst && idle_for_long_time &&
-		bfqq->entity.new_weight == 40;
+		bfqq->dispatched == 0;
+	*interactive = !in_burst && idle_for_long_time;
 	wr_or_deserves_wr = bfqd->low_latency &&
 		(bfqq->wr_coeff > 1 ||
 		 (bfq_bfqq_sync(bfqq) &&
@@ -1729,6 +1717,17 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 
 	bfq_clear_bfqq_just_created(bfqq);
 
+
+	if (!bfq_bfqq_IO_bound(bfqq)) {
+		if (arrived_in_time) {
+			bfqq->requests_within_timer++;
+			if (bfqq->requests_within_timer >=
+			    bfqd->bfq_requests_within_timer)
+				bfq_mark_bfqq_IO_bound(bfqq);
+		} else
+			bfqq->requests_within_timer = 0;
+	}
+
 	if (bfqd->low_latency) {
 		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
 			/* wraparound */
@@ -1756,10 +1755,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	bfq_add_bfqq_busy(bfqd, bfqq);
 
 	/*
-	 * Expire in-service queue if preemption may be needed for
-	 * guarantees or throughput. As for guarantees, we care
-	 * explicitly about two cases. The first is that bfqq has to
-	 * recover a service hole, as explained in the comments on
+	 * Expire in-service queue only if preemption may be needed
+	 * for guarantees. In particular, we care only about two
+	 * cases. The first is that bfqq has to recover a service
+	 * hole, as explained in the comments on
 	 * bfq_bfqq_update_budg_for_activation(), i.e., that
 	 * bfqq_wants_to_preempt is true. However, if bfqq does not
 	 * carry time-critical I/O, then bfqq's bandwidth is less
@@ -1786,23 +1785,11 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	 * timestamps of the in-service queue would need to be
 	 * updated, and this operation is quite costly (see the
 	 * comments on bfq_bfqq_update_budg_for_activation()).
-	 *
-	 * As for throughput, we ask bfq_better_to_idle() whether we
-	 * still need to plug I/O dispatching. If bfq_better_to_idle()
-	 * says no, then plugging is not needed any longer, either to
-	 * boost throughput or to perserve service guarantees. Then
-	 * the best option is to stop plugging I/O, as not doing so
-	 * would certainly lower throughput. We may end up in this
-	 * case if: (1) upon a dispatch attempt, we detected that it
-	 * was better to plug I/O dispatch, and to wait for a new
-	 * request to arrive for the currently in-service queue, but
-	 * (2) this switch of bfqq to busy changes the scenario.
 	 */
 	if (bfqd->in_service_queue &&
 	    ((bfqq_wants_to_preempt &&
 	      bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
-	     bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) ||
-	     !bfq_better_to_idle(bfqd->in_service_queue)) &&
+	     bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
 	    next_queue_may_preempt(bfqd))
 		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
 				false, BFQQE_PREEMPTED);
@@ -1874,138 +1861,6 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd,
 	bfqq->decrease_time_jif = jiffies;
 }
 
-static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
-{
-	u64 tot_io_time = now_ns - bfqq->io_start_time;
-
-	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0)
-		bfqq->tot_idle_time +=
-			now_ns - bfqq->ttime.last_end_request;
-
-	if (unlikely(bfq_bfqq_just_created(bfqq)))
-		return;
-
-	/*
-	 * Must be busy for at least about 80% of the time to be
-	 * considered I/O bound.
-	 */
-	if (bfqq->tot_idle_time * 5 > tot_io_time)
-		bfq_clear_bfqq_IO_bound(bfqq);
-	else
-		bfq_mark_bfqq_IO_bound(bfqq);
-
-	/*
-	 * Keep an observation window of at most 200 ms in the past
-	 * from now.
-	 */
-	if (tot_io_time > 200 * NSEC_PER_MSEC) {
-		bfqq->io_start_time = now_ns - (tot_io_time>>1);
-		bfqq->tot_idle_time >>= 1;
-	}
-}
-
-/*
- * Detect whether bfqq's I/O seems synchronized with that of some
- * other queue, i.e., whether bfqq, after remaining empty, happens to
- * receive new I/O only right after some I/O request of the other
- * queue has been completed. We call waker queue the other queue, and
- * we assume, for simplicity, that bfqq may have at most one waker
- * queue.
- *
- * A remarkable throughput boost can be reached by unconditionally
- * injecting the I/O of the waker queue, every time a new
- * bfq_dispatch_request happens to be invoked while I/O is being
- * plugged for bfqq.  In addition to boosting throughput, this
- * unblocks bfqq's I/O, thereby improving bandwidth and latency for
- * bfqq. Note that these same results may be achieved with the general
- * injection mechanism, but less effectively. For details on this
- * aspect, see the comments on the choice of the queue for injection
- * in bfq_select_queue().
- *
- * Turning back to the detection of a waker queue, a queue Q is deemed
- * as a waker queue for bfqq if, for three consecutive times, bfqq
- * happens to become non empty right after a request of Q has been
- * completed. In particular, on the first time, Q is tentatively set
- * as a candidate waker queue, while on the third consecutive time
- * that Q is detected, the field waker_bfqq is set to Q, to confirm
- * that Q is a waker queue for bfqq. These detection steps are
- * performed only if bfqq has a long think time, so as to make it more
- * likely that bfqq's I/O is actually being blocked by a
- * synchronization. This last filter, plus the above three-times
- * requirement, make false positives less likely.
- *
- * NOTE
- *
- * The sooner a waker queue is detected, the sooner throughput can be
- * boosted by injecting I/O from the waker queue. Fortunately,
- * detection is likely to be actually fast, for the following
- * reasons. While blocked by synchronization, bfqq has a long think
- * time. This implies that bfqq's inject limit is at least equal to 1
- * (see the comments in bfq_update_inject_limit()). So, thanks to
- * injection, the waker queue is likely to be served during the very
- * first I/O-plugging time interval for bfqq. This triggers the first
- * step of the detection mechanism. Thanks again to injection, the
- * candidate waker queue is then likely to be confirmed no later than
- * during the next I/O-plugging interval for bfqq.
- *
- * ISSUE
- *
- * On queue merging all waker information is lost.
- */
-static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			    u64 now_ns)
-{
-	if (!bfqd->last_completed_rq_bfqq ||
-	    bfqd->last_completed_rq_bfqq == bfqq ||
-	    bfq_bfqq_has_short_ttime(bfqq) ||
-	    now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
-	    bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
-		return;
-
-	if (bfqd->last_completed_rq_bfqq !=
-	    bfqq->tentative_waker_bfqq) {
-		/*
-		 * First synchronization detected with a
-		 * candidate waker queue, or with a different
-		 * candidate waker queue from the current one.
-		 */
-		bfqq->tentative_waker_bfqq =
-			bfqd->last_completed_rq_bfqq;
-		bfqq->num_waker_detections = 1;
-	} else /* Same tentative waker queue detected again */
-		bfqq->num_waker_detections++;
-
-	if (bfqq->num_waker_detections == 3) {
-		bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
-		bfqq->tentative_waker_bfqq = NULL;
-
-		/*
-		 * If the waker queue disappears, then
-		 * bfqq->waker_bfqq must be reset. To
-		 * this goal, we maintain in each
-		 * waker queue a list, woken_list, of
-		 * all the queues that reference the
-		 * waker queue through their
-		 * waker_bfqq pointer. When the waker
-		 * queue exits, the waker_bfqq pointer
-		 * of all the queues in the woken_list
-		 * is reset.
-		 *
-		 * In addition, if bfqq is already in
-		 * the woken_list of a waker queue,
-		 * then, before being inserted into
-		 * the woken_list of a new waker
-		 * queue, bfqq must be removed from
-		 * the woken_list of the old waker
-		 * queue.
-		 */
-		if (!hlist_unhashed(&bfqq->woken_list_node))
-			hlist_del_init(&bfqq->woken_list_node);
-		hlist_add_head(&bfqq->woken_list_node,
-			       &bfqd->last_completed_rq_bfqq->woken_list);
-	}
-}
-
 static void bfq_add_request(struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -2013,14 +1868,117 @@ static void bfq_add_request(struct request *rq)
 	struct request *next_rq, *prev;
 	unsigned int old_wr_coeff = bfqq->wr_coeff;
 	bool interactive = false;
-	u64 now_ns = ktime_get_ns();
 
 	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
 	bfqq->queued[rq_is_sync(rq)]++;
 	bfqd->queued++;
 
 	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
-		bfq_check_waker(bfqd, bfqq, now_ns);
+		/*
+		 * Detect whether bfqq's I/O seems synchronized with
+		 * that of some other queue, i.e., whether bfqq, after
+		 * remaining empty, happens to receive new I/O only
+		 * right after some I/O request of the other queue has
+		 * been completed. We call waker queue the other
+		 * queue, and we assume, for simplicity, that bfqq may
+		 * have at most one waker queue.
+		 *
+		 * A remarkable throughput boost can be reached by
+		 * unconditionally injecting the I/O of the waker
+		 * queue, every time a new bfq_dispatch_request
+		 * happens to be invoked while I/O is being plugged
+		 * for bfqq.  In addition to boosting throughput, this
+		 * unblocks bfqq's I/O, thereby improving bandwidth
+		 * and latency for bfqq. Note that these same results
+		 * may be achieved with the general injection
+		 * mechanism, but less effectively. For details on
+		 * this aspect, see the comments on the choice of the
+		 * queue for injection in bfq_select_queue().
+		 *
+		 * Turning back to the detection of a waker queue, a
+		 * queue Q is deemed as a waker queue for bfqq if, for
+		 * two consecutive times, bfqq happens to become non
+		 * empty right after a request of Q has been
+		 * completed. In particular, on the first time, Q is
+		 * tentatively set as a candidate waker queue, while
+		 * on the second time, the flag
+		 * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+		 * is a waker queue for bfqq. These detection steps
+		 * are performed only if bfqq has a long think time,
+		 * so as to make it more likely that bfqq's I/O is
+		 * actually being blocked by a synchronization. This
+		 * last filter, plus the above two-times requirement,
+		 * make false positives less likely.
+		 *
+		 * NOTE
+		 *
+		 * The sooner a waker queue is detected, the sooner
+		 * throughput can be boosted by injecting I/O from the
+		 * waker queue. Fortunately, detection is likely to be
+		 * actually fast, for the following reasons. While
+		 * blocked by synchronization, bfqq has a long think
+		 * time. This implies that bfqq's inject limit is at
+		 * least equal to 1 (see the comments in
+		 * bfq_update_inject_limit()). So, thanks to
+		 * injection, the waker queue is likely to be served
+		 * during the very first I/O-plugging time interval
+		 * for bfqq. This triggers the first step of the
+		 * detection mechanism. Thanks again to injection, the
+		 * candidate waker queue is then likely to be
+		 * confirmed no later than during the next
+		 * I/O-plugging interval for bfqq.
+		 */
+		if (bfqd->last_completed_rq_bfqq &&
+		    !bfq_bfqq_has_short_ttime(bfqq) &&
+		    ktime_get_ns() - bfqd->last_completion <
+		    200 * NSEC_PER_USEC) {
+			if (bfqd->last_completed_rq_bfqq != bfqq &&
+			    bfqd->last_completed_rq_bfqq !=
+			    bfqq->waker_bfqq) {
+				/*
+				 * First synchronization detected with
+				 * a candidate waker queue, or with a
+				 * different candidate waker queue
+				 * from the current one.
+				 */
+				bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+				/*
+				 * If the waker queue disappears, then
+				 * bfqq->waker_bfqq must be reset. To
+				 * this goal, we maintain in each
+				 * waker queue a list, woken_list, of
+				 * all the queues that reference the
+				 * waker queue through their
+				 * waker_bfqq pointer. When the waker
+				 * queue exits, the waker_bfqq pointer
+				 * of all the queues in the woken_list
+				 * is reset.
+				 *
+				 * In addition, if bfqq is already in
+				 * the woken_list of a waker queue,
+				 * then, before being inserted into
+				 * the woken_list of a new waker
+				 * queue, bfqq must be removed from
+				 * the woken_list of the old waker
+				 * queue.
+				 */
+				if (!hlist_unhashed(&bfqq->woken_list_node))
+					hlist_del_init(&bfqq->woken_list_node);
+				hlist_add_head(&bfqq->woken_list_node,
+				    &bfqd->last_completed_rq_bfqq->woken_list);
+
+				bfq_clear_bfqq_has_waker(bfqq);
+			} else if (bfqd->last_completed_rq_bfqq ==
+				   bfqq->waker_bfqq &&
+				   !bfq_bfqq_has_waker(bfqq)) {
+				/*
+				 * synchronization with waker_bfqq
+				 * seen for the second time
+				 */
+				bfq_mark_bfqq_has_waker(bfqq);
+			}
+		}
 
 		/*
 		 * Periodically reset inject limit, to make sure that
@@ -2089,9 +2047,6 @@ static void bfq_add_request(struct request *rq)
 		}
 	}
 
-	if (bfq_bfqq_sync(bfqq))
-		bfq_update_io_intensity(bfqq, now_ns);
-
 	elv_rb_add(&bfqq->sort_list, rq);
 
 	/*
@@ -2397,24 +2352,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 /* Must be called with bfqq != NULL */
 static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
 {
-	/*
-	 * If bfqq has been enjoying interactive weight-raising, then
-	 * reset soft_rt_next_start. We do it for the following
-	 * reason. bfqq may have been conveying the I/O needed to load
-	 * a soft real-time application. Such an application actually
-	 * exhibits a soft real-time I/O pattern after it finishes
-	 * loading, and finally starts doing its job. But, if bfqq has
-	 * been receiving a lot of bandwidth so far (likely to happen
-	 * on a fast device), then soft_rt_next_start now contains a
-	 * high value that. So, without this reset, bfqq would be
-	 * prevented from being possibly considered as soft_rt for a
-	 * very long time.
-	 */
-
-	if (bfqq->wr_cur_max_time !=
-	    bfqq->bfqd->bfq_wr_rt_max_time)
-		bfqq->soft_rt_next_start = jiffies;
-
 	if (bfq_bfqq_busy(bfqq))
 		bfqq->bfqd->wr_busy_queues--;
 	bfqq->wr_coeff = 1;
@@ -2749,16 +2686,10 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	if (!bic)
 		return;
 
-	bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns;
-	bic->saved_inject_limit = bfqq->inject_limit;
-	bic->saved_decrease_time_jif = bfqq->decrease_time_jif;
-
 	bic->saved_weight = bfqq->entity.orig_weight;
 	bic->saved_ttime = bfqq->ttime;
 	bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
 	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
-	bic->saved_io_start_time = bfqq->io_start_time;
-	bic->saved_tot_idle_time = bfqq->tot_idle_time;
 	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
 	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
 	if (unlikely(bfq_bfqq_just_created(bfqq) &&
@@ -2781,7 +2712,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 		bic->saved_wr_coeff = bfqq->wr_coeff;
 		bic->saved_wr_start_at_switch_to_srt =
 			bfqq->wr_start_at_switch_to_srt;
-		bic->saved_service_from_wr = bfqq->service_from_wr;
 		bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
 		bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
 	}
@@ -3007,7 +2937,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
 	}
 
 	bfqd->in_service_queue = bfqq;
-	bfqd->in_serv_last_pos = 0;
 }
 
 /*
@@ -3513,38 +3442,20 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
  * order until all the requests already queued in the device have been
  * served. The last sub-condition commented above somewhat mitigates
  * this problem for weight-raised queues.
- *
- * However, as an additional mitigation for this problem, we preserve
- * plugging for a special symmetric case that may suddenly turn into
- * asymmetric: the case where only bfqq is busy. In this case, not
- * expiring bfqq does not cause any harm to any other queues in terms
- * of service guarantees. In contrast, it avoids the following unlucky
- * sequence of events: (1) bfqq is expired, (2) a new queue with a
- * lower weight than bfqq becomes busy (or more queues), (3) the new
- * queue is served until a new request arrives for bfqq, (4) when bfqq
- * is finally served, there are so many requests of the new queue in
- * the drive that the pending requests for bfqq take a lot of time to
- * be served. In particular, event (2) may case even already
- * dispatched requests of bfqq to be delayed, inside the drive. So, to
- * avoid this series of events, the scenario is preventively declared
- * as asymmetric also if bfqq is the only busy queues
  */
 static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
 						 struct bfq_queue *bfqq)
 {
-	int tot_busy_queues = bfq_tot_busy_queues(bfqd);
-
 	/* No point in idling for bfqq if it won't get requests any longer */
 	if (unlikely(!bfqq_process_refs(bfqq)))
 		return false;
 
 	return (bfqq->wr_coeff > 1 &&
 		(bfqd->wr_busy_queues <
-		 tot_busy_queues ||
+		 bfq_tot_busy_queues(bfqd) ||
 		 bfqd->rq_in_driver >=
 		 bfqq->dispatched + 4)) ||
-		bfq_asymmetric_scenario(bfqd, bfqq) ||
-		tot_busy_queues == 1;
+		bfq_asymmetric_scenario(bfqd, bfqq);
 }
 
 static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -4028,6 +3939,10 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 	      bfq_bfqq_budget_left(bfqq) >=  entity->budget / 3)))
 		bfq_bfqq_charge_time(bfqd, bfqq, delta);
 
+	if (reason == BFQQE_TOO_IDLE &&
+	    entity->service <= 2 * entity->budget / 10)
+		bfq_clear_bfqq_IO_bound(bfqq);
+
 	if (bfqd->low_latency && bfqq->wr_coeff == 1)
 		bfqq->last_wr_start_finish = jiffies;
 
@@ -4037,15 +3952,30 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 		 * If we get here, and there are no outstanding
 		 * requests, then the request pattern is isochronous
 		 * (see the comments on the function
-		 * bfq_bfqq_softrt_next_start()). Therefore we can
-		 * compute soft_rt_next_start.
+		 * bfq_bfqq_softrt_next_start()). Thus we can compute
+		 * soft_rt_next_start. And we do it, unless bfqq is in
+		 * interactive weight raising. We do not do it in the
+		 * latter subcase, for the following reason. bfqq may
+		 * be conveying the I/O needed to load a soft
+		 * real-time application. Such an application will
+		 * actually exhibit a soft real-time I/O pattern after
+		 * it finally starts doing its job. But, if
+		 * soft_rt_next_start is computed here for an
+		 * interactive bfqq, and bfqq had received a lot of
+		 * service before remaining with no outstanding
+		 * request (likely to happen on a fast device), then
+		 * soft_rt_next_start would be assigned such a high
+		 * value that, for a very long time, bfqq would be
+		 * prevented from being possibly considered as soft
+		 * real time.
 		 *
 		 * If, instead, the queue still has outstanding
 		 * requests, then we have to wait for the completion
 		 * of all the outstanding requests to discover whether
 		 * the request pattern is actually isochronous.
 		 */
-		if (bfqq->dispatched == 0)
+		if (bfqq->dispatched == 0 &&
+		    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
 			bfqq->soft_rt_next_start =
 				bfq_bfqq_softrt_next_start(bfqd, bfqq);
 		else if (bfqq->dispatched > 0) {
@@ -4567,9 +4497,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 		    bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
 		    bfq_bfqq_budget_left(async_bfqq))
 			bfqq = bfqq->bic->bfqq[0];
-		else if (bfqq->waker_bfqq &&
+		else if (bfq_bfqq_has_waker(bfqq) &&
 			   bfq_bfqq_busy(bfqq->waker_bfqq) &&
-			   bfqq->waker_bfqq->next_rq &&
+			   bfqq->next_rq &&
 			   bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
 					      bfqq->waker_bfqq) <=
 			   bfq_bfqq_budget_left(bfqq->waker_bfqq)
@@ -4629,21 +4559,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 						bfqq->wr_cur_max_time)) {
 			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
 			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
-					       bfq_wr_duration(bfqd))) {
-				/*
-				 * Either in interactive weight
-				 * raising, or in soft_rt weight
-				 * raising with the
-				 * interactive-weight-raising period
-				 * elapsed (so no switch back to
-				 * interactive weight raising).
-				 */
+					       bfq_wr_duration(bfqd)))
 				bfq_bfqq_end_wr(bfqq);
-			} else { /*
-				  * soft_rt finishing while still in
-				  * interactive period, switch back to
-				  * interactive weight raising
-				  */
+			else {
 				switch_back_to_interactive_wr(bfqq, bfqd);
 				bfqq->entity.prio_changed = 1;
 			}
@@ -4722,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 
+	if (!atomic_read(&hctx->elevator_queued))
+		return false;
+
 	/*
 	 * Avoiding lock: a race on bfqd->busy_queues should cause at
 	 * most a call to dispatch for nothing
@@ -4971,6 +4892,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 	hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
 				  woken_list_node) {
 		item->waker_bfqq = NULL;
+		bfq_clear_bfqq_has_waker(item);
 		hlist_del_init(&item->woken_list_node);
 	}
 
@@ -5090,8 +5012,6 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 	}
 
 	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
-	bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d",
-		     bfqq->new_ioprio, bfqq->entity.new_weight);
 	bfqq->entity.prio_changed = 1;
 }
 
@@ -5129,8 +5049,6 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
 {
-	u64 now_ns = ktime_get_ns();
-
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
 	INIT_LIST_HEAD(&bfqq->fifo);
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
@@ -5158,9 +5076,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_clear_bfqq_sync(bfqq);
 
 	/* set end request to minus infinity from now */
-	bfqq->ttime.last_end_request = now_ns + 1;
-
-	bfqq->io_start_time = now_ns;
+	bfqq->ttime.last_end_request = ktime_get_ns() + 1;
 
 	bfq_mark_bfqq_IO_bound(bfqq);
 
@@ -5278,19 +5194,11 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
 				    struct bfq_queue *bfqq)
 {
 	struct bfq_ttime *ttime = &bfqq->ttime;
-	u64 elapsed;
+	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
 
-	/*
-	 * We are really interested in how long it takes for the queue to
-	 * become busy when there is no outstanding IO for this queue. So
-	 * ignore cases when the bfq queue has already IO queued.
-	 */
-	if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
-		return;
-	elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
 	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
 
-	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
+	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
 	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
 	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
 				     ttime->ttime_samples);
@@ -5305,26 +5213,8 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	if (bfqq->wr_coeff > 1 &&
 	    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
-	    BFQQ_TOTALLY_SEEKY(bfqq)) {
-		if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
-					   bfq_wr_duration(bfqd))) {
-			/*
-			 * In soft_rt weight raising with the
-			 * interactive-weight-raising period
-			 * elapsed (so no switch back to
-			 * interactive weight raising).
-			 */
-			bfq_bfqq_end_wr(bfqq);
-		} else { /*
-			  * stopping soft_rt weight raising
-			  * while still in interactive period,
-			  * switch back to interactive weight
-			  * raising
-			  */
-			switch_back_to_interactive_wr(bfqq, bfqd);
-			bfqq->entity.prio_changed = 1;
-		}
-	}
+	    BFQQ_TOTALLY_SEEKY(bfqq))
+		bfq_bfqq_end_wr(bfqq);
 }
 
 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -5348,13 +5238,12 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 		return;
 
 	/* Think time is infinite if no process is linked to
-	 * bfqq. Otherwise check average think time to decide whether
-	 * to mark as has_short_ttime. To this goal, compare average
-	 * think time with half the I/O-plugging timeout.
+	 * bfqq. Otherwise check average think time to
+	 * decide whether to mark as has_short_ttime
 	 */
 	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
 	    (bfq_sample_valid(bfqq->ttime.ttime_samples) &&
-	     bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1))
+	     bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
 		has_short_ttime = false;
 
 	state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
@@ -5668,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		bfq_insert_request(hctx, rq, at_head);
+		atomic_inc(&hctx->elevator_queued);
 	}
 }
 
@@ -6035,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq)
 
 		bfq_completed_request(bfqq, bfqd);
 		bfq_finish_requeue_request_body(bfqq);
+		atomic_dec(&rq->mq_hctx->elevator_queued);
 
 		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
@@ -6598,6 +6489,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->bfq_slice_idle = bfq_slice_idle;
 	bfqd->bfq_timeout = bfq_timeout;
 
+	bfqd->bfq_requests_within_timer = 120;
+
 	bfqd->bfq_large_burst_thresh = 8;
 	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
 
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index b8e793c34ff1..703895224562 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -291,11 +291,6 @@ struct bfq_queue {
 	/* associated @bfq_ttime struct */
 	struct bfq_ttime ttime;
 
-	/* when bfqq started to do I/O within the last observation window */
-	u64 io_start_time;
-	/* how long bfqq has remained empty during the last observ. window */
-	u64 tot_idle_time;
-
 	/* bit vector: a 1 for each seeky requests in history */
 	u32 seek_history;
 
@@ -376,11 +371,6 @@ struct bfq_queue {
 	 * bfq_select_queue().
 	 */
 	struct bfq_queue *waker_bfqq;
-	/* pointer to the curr. tentative waker queue, see bfq_check_waker() */
-	struct bfq_queue *tentative_waker_bfqq;
-	/* number of times the same tentative waker has been detected */
-	unsigned int num_waker_detections;
-
 	/* node for woken_list, see below */
 	struct hlist_node woken_list_node;
 	/*
@@ -417,9 +407,6 @@ struct bfq_io_cq {
 	 */
 	bool saved_IO_bound;
 
-	u64 saved_io_start_time;
-	u64 saved_tot_idle_time;
-
 	/*
 	 * Same purpose as the previous fields for the value of the
 	 * field keeping the queue's belonging to a large burst
@@ -445,15 +432,9 @@ struct bfq_io_cq {
 	 */
 	unsigned long saved_wr_coeff;
 	unsigned long saved_last_wr_start_finish;
-	unsigned long saved_service_from_wr;
 	unsigned long saved_wr_start_at_switch_to_srt;
 	unsigned int saved_wr_cur_max_time;
 	struct bfq_ttime saved_ttime;
-
-	/* Save also injection state */
-	u64 saved_last_serv_time_ns;
-	unsigned int saved_inject_limit;
-	unsigned long saved_decrease_time_jif;
 };
 
 /**
@@ -660,6 +641,14 @@ struct bfq_data {
 	 */
 	unsigned int bfq_timeout;
 
+	/*
+	 * Number of consecutive requests that must be issued within
+	 * the idle time slice to set again idling to a queue which
+	 * was marked as non-I/O-bound (see the definition of the
+	 * IO_bound flag for further details).
+	 */
+	unsigned int bfq_requests_within_timer;
+
 	/*
 	 * Force device idling whenever needed to provide accurate
 	 * service guarantees, without caring about throughput
@@ -781,6 +770,7 @@ enum bfqq_state_flags {
 				 */
 	BFQQF_coop,		/* bfqq is shared */
 	BFQQF_split_coop,	/* shared bfqq will be split */
+	BFQQF_has_waker		/* bfqq has a waker queue */
 };
 
 #define BFQ_BFQQ_FNS(name)						\
@@ -800,6 +790,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS
 
 /* Expiration reasons. */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 070e34a7feb1..26776bdbdf36 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -137,6 +137,9 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
 
 	sd->next_in_service = next_in_service;
 
+	if (!next_in_service)
+		return parent_sched_may_change;
+
 	return parent_sched_may_change;
 }
 
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index dfa652122a2d..c3e5abcfdc98 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -14,6 +14,8 @@
 #include <linux/slab.h>
 #include "blk.h"
 
+#define BIP_INLINE_VECS	4
+
 static struct kmem_cache *bip_slab;
 static struct workqueue_struct *kintegrityd_wq;
 
@@ -28,7 +30,7 @@ static void __bio_integrity_free(struct bio_set *bs,
 	if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
 		if (bip->bip_vec)
 			bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
-				  bip->bip_max_vcnt);
+				  bip->bip_slab);
 		mempool_free(bip, &bs->bio_integrity_pool);
 	} else {
 		kfree(bip);
@@ -61,7 +63,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 		inline_vecs = nr_vecs;
 	} else {
 		bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
-		inline_vecs = BIO_INLINE_VECS;
+		inline_vecs = BIP_INLINE_VECS;
 	}
 
 	if (unlikely(!bip))
@@ -70,11 +72,14 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 	memset(bip, 0, sizeof(*bip));
 
 	if (nr_vecs > inline_vecs) {
-		bip->bip_max_vcnt = nr_vecs;
-		bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
-					  &bip->bip_max_vcnt, gfp_mask);
+		unsigned long idx = 0;
+
+		bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
+					  &bs->bvec_integrity_pool);
 		if (!bip->bip_vec)
 			goto err;
+		bip->bip_max_vcnt = bvec_nr_vecs(idx);
+		bip->bip_slab = idx;
 	} else {
 		bip->bip_vec = bip->bip_inline_vecs;
 		bip->bip_max_vcnt = inline_vecs;
@@ -465,6 +470,6 @@ void __init bio_integrity_init(void)
 
 	bip_slab = kmem_cache_create("bio_integrity_payload",
 				     sizeof(struct bio_integrity_payload) +
-				     sizeof(struct bio_vec) * BIO_INLINE_VECS,
+				     sizeof(struct bio_vec) * BIP_INLINE_VECS,
 				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 }
diff --git a/block/bio.c b/block/bio.c
index a1c4d2900c7a..4c99ef28fef8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,40 +19,27 @@
 #include <linux/highmem.h>
 #include <linux/sched/sysctl.h>
 #include <linux/blk-crypto.h>
-#include <linux/xarray.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
 #include "blk-rq-qos.h"
 
-static struct biovec_slab {
-	int nr_vecs;
-	char *name;
-	struct kmem_cache *slab;
-} bvec_slabs[] __read_mostly = {
-	{ .nr_vecs = 16, .name = "biovec-16" },
-	{ .nr_vecs = 64, .name = "biovec-64" },
-	{ .nr_vecs = 128, .name = "biovec-128" },
-	{ .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" },
-};
+/*
+ * Test patch to inline a certain number of bi_io_vec's inside the bio
+ * itself, to shrink a bio data allocation from two mempool calls to one
+ */
+#define BIO_INLINE_VECS		4
 
-static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
-{
-	switch (nr_vecs) {
-	/* smaller bios use inline vecs */
-	case 5 ... 16:
-		return &bvec_slabs[0];
-	case 17 ... 64:
-		return &bvec_slabs[1];
-	case 65 ... 128:
-		return &bvec_slabs[2];
-	case 129 ... BIO_MAX_PAGES:
-		return &bvec_slabs[3];
-	default:
-		BUG();
-		return NULL;
-	}
-}
+/*
+ * if you change this list, also change bvec_alloc or things will
+ * break badly! cannot be bigger than what you can fit into an
+ * unsigned short
+ */
+#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n }
+static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
+	BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max),
+};
+#undef BV
 
 /*
  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
@@ -71,133 +58,178 @@ struct bio_slab {
 	char name[8];
 };
 static DEFINE_MUTEX(bio_slab_lock);
-static DEFINE_XARRAY(bio_slabs);
+static struct bio_slab *bio_slabs;
+static unsigned int bio_slab_nr, bio_slab_max;
 
-static struct bio_slab *create_bio_slab(unsigned int size)
+static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
-	struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);
-
-	if (!bslab)
-		return NULL;
-
-	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
-	bslab->slab = kmem_cache_create(bslab->name, size,
-			ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
-	if (!bslab->slab)
-		goto fail_alloc_slab;
-
-	bslab->slab_ref = 1;
-	bslab->slab_size = size;
-
-	if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
-		return bslab;
-
-	kmem_cache_destroy(bslab->slab);
-
-fail_alloc_slab:
-	kfree(bslab);
-	return NULL;
-}
-
-static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
-{
-	return bs->front_pad + sizeof(struct bio) + bs->back_pad;
-}
-
-static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
-{
-	unsigned int size = bs_bio_slab_size(bs);
-	struct bio_slab *bslab;
+	unsigned int sz = sizeof(struct bio) + extra_size;
+	struct kmem_cache *slab = NULL;
+	struct bio_slab *bslab, *new_bio_slabs;
+	unsigned int new_bio_slab_max;
+	unsigned int i, entry = -1;
 
 	mutex_lock(&bio_slab_lock);
-	bslab = xa_load(&bio_slabs, size);
-	if (bslab)
-		bslab->slab_ref++;
-	else
-		bslab = create_bio_slab(size);
-	mutex_unlock(&bio_slab_lock);
 
-	if (bslab)
-		return bslab->slab;
-	return NULL;
+	i = 0;
+	while (i < bio_slab_nr) {
+		bslab = &bio_slabs[i];
+
+		if (!bslab->slab && entry == -1)
+			entry = i;
+		else if (bslab->slab_size == sz) {
+			slab = bslab->slab;
+			bslab->slab_ref++;
+			break;
+		}
+		i++;
+	}
+
+	if (slab)
+		goto out_unlock;
+
+	if (bio_slab_nr == bio_slab_max && entry == -1) {
+		new_bio_slab_max = bio_slab_max << 1;
+		new_bio_slabs = krealloc(bio_slabs,
+					 new_bio_slab_max * sizeof(struct bio_slab),
+					 GFP_KERNEL);
+		if (!new_bio_slabs)
+			goto out_unlock;
+		bio_slab_max = new_bio_slab_max;
+		bio_slabs = new_bio_slabs;
+	}
+	if (entry == -1)
+		entry = bio_slab_nr++;
+
+	bslab = &bio_slabs[entry];
+
+	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
+	slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
+				 SLAB_HWCACHE_ALIGN, NULL);
+	if (!slab)
+		goto out_unlock;
+
+	bslab->slab = slab;
+	bslab->slab_ref = 1;
+	bslab->slab_size = sz;
+out_unlock:
+	mutex_unlock(&bio_slab_lock);
+	return slab;
 }
 
 static void bio_put_slab(struct bio_set *bs)
 {
 	struct bio_slab *bslab = NULL;
-	unsigned int slab_size = bs_bio_slab_size(bs);
+	unsigned int i;
 
 	mutex_lock(&bio_slab_lock);
 
-	bslab = xa_load(&bio_slabs, slab_size);
+	for (i = 0; i < bio_slab_nr; i++) {
+		if (bs->bio_slab == bio_slabs[i].slab) {
+			bslab = &bio_slabs[i];
+			break;
+		}
+	}
+
 	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
 		goto out;
 
-	WARN_ON_ONCE(bslab->slab != bs->bio_slab);
-
 	WARN_ON(!bslab->slab_ref);
 
 	if (--bslab->slab_ref)
 		goto out;
 
-	xa_erase(&bio_slabs, slab_size);
-
 	kmem_cache_destroy(bslab->slab);
-	kfree(bslab);
+	bslab->slab = NULL;
 
 out:
 	mutex_unlock(&bio_slab_lock);
 }
 
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
+unsigned int bvec_nr_vecs(unsigned short idx)
 {
-	BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES);
+	return bvec_slabs[--idx].nr_vecs;
+}
 
-	if (nr_vecs == BIO_MAX_PAGES)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
+{
+	if (!idx)
+		return;
+	idx--;
+
+	BIO_BUG_ON(idx >= BVEC_POOL_NR);
+
+	if (idx == BVEC_POOL_MAX) {
 		mempool_free(bv, pool);
-	else if (nr_vecs > BIO_INLINE_VECS)
-		kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
+	} else {
+		struct biovec_slab *bvs = bvec_slabs + idx;
+
+		kmem_cache_free(bvs->slab, bv);
+	}
 }
 
-/*
- * Make the first allocation restricted and don't dump info on allocation
- * failures, since we'll fall back to the mempool in case of failure.
- */
-static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
+struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
+			   mempool_t *pool)
 {
-	return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
-		__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
-}
+	struct bio_vec *bvl;
 
-struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
-		gfp_t gfp_mask)
-{
-	struct biovec_slab *bvs = biovec_slab(*nr_vecs);
-
-	if (WARN_ON_ONCE(!bvs))
+	/*
+	 * see comment near bvec_array define!
+	 */
+	switch (nr) {
+	case 1:
+		*idx = 0;
+		break;
+	case 2 ... 4:
+		*idx = 1;
+		break;
+	case 5 ... 16:
+		*idx = 2;
+		break;
+	case 17 ... 64:
+		*idx = 3;
+		break;
+	case 65 ... 128:
+		*idx = 4;
+		break;
+	case 129 ... BIO_MAX_PAGES:
+		*idx = 5;
+		break;
+	default:
 		return NULL;
-
-	/*
-	 * Upgrade the nr_vecs request to take full advantage of the allocation.
-	 * We also rely on this in the bvec_free path.
-	 */
-	*nr_vecs = bvs->nr_vecs;
-
-	/*
-	 * Try a slab allocation first for all smaller allocations.  If that
-	 * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
-	 * The mempool is sized to handle up to BIO_MAX_PAGES entries.
-	 */
-	if (*nr_vecs < BIO_MAX_PAGES) {
-		struct bio_vec *bvl;
-
-		bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
-		if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
-			return bvl;
-		*nr_vecs = BIO_MAX_PAGES;
 	}
 
-	return mempool_alloc(pool, gfp_mask);
+	/*
+	 * idx now points to the pool we want to allocate from. only the
+	 * 1-vec entry pool is mempool backed.
+	 */
+	if (*idx == BVEC_POOL_MAX) {
+fallback:
+		bvl = mempool_alloc(pool, gfp_mask);
+	} else {
+		struct biovec_slab *bvs = bvec_slabs + *idx;
+		gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
+
+		/*
+		 * Make this allocation restricted and don't dump info on
+		 * allocation failures, since we'll fallback to the mempool
+		 * in case of failure.
+		 */
+		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+
+		/*
+		 * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
+		 * is set, retry with the 1-entry mempool
+		 */
+		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
+		if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
+			*idx = BVEC_POOL_MAX;
+			goto fallback;
+		}
+	}
+
+	(*idx)++;
+	return bvl;
 }
 
 void bio_uninit(struct bio *bio)
@@ -223,7 +255,7 @@ static void bio_free(struct bio *bio)
 	bio_uninit(bio);
 
 	if (bs) {
-		bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
+		bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
 
 		/*
 		 * If we have front padding, adjust the bio pointer before freeing
@@ -267,8 +299,12 @@ EXPORT_SYMBOL(bio_init);
  */
 void bio_reset(struct bio *bio)
 {
+	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
+
 	bio_uninit(bio);
+
 	memset(bio, 0, BIO_RESET_BYTES);
+	bio->bi_flags = flags;
 	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
@@ -369,97 +405,122 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
  * @nr_iovecs:	number of iovecs to pre-allocate
  * @bs:		the bio_set to allocate from.
  *
- * Allocate a bio from the mempools in @bs.
+ * Description:
+ *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
+ *   backed by the @bs's mempool.
  *
- * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
- * allocate a bio.  This is due to the mempool guarantees.  To make this work,
- * callers must never allocate more than 1 bio at a time from the general pool.
- * Callers that need to allocate more than 1 bio must always submit the
- * previously allocated bio for IO before attempting to allocate a new one.
- * Failure to do so can cause deadlocks under memory pressure.
+ *   When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
+ *   always be able to allocate a bio. This is due to the mempool guarantees.
+ *   To make this work, callers must never allocate more than 1 bio at a time
+ *   from this pool. Callers that need to allocate more than 1 bio must always
+ *   submit the previously allocated bio for IO before attempting to allocate
+ *   a new one. Failure to do so can cause deadlocks under memory pressure.
  *
- * Note that when running under submit_bio_noacct() (i.e. any block driver),
- * bios are not submitted until after you return - see the code in
- * submit_bio_noacct() that converts recursion into iteration, to prevent
- * stack overflows.
+ *   Note that when running under submit_bio_noacct() (i.e. any block
+ *   driver), bios are not submitted until after you return - see the code in
+ *   submit_bio_noacct() that converts recursion into iteration, to prevent
+ *   stack overflows.
  *
- * This would normally mean allocating multiple bios under submit_bio_noacct()
- * would be susceptible to deadlocks, but we have
- * deadlock avoidance code that resubmits any blocked bios from a rescuer
- * thread.
+ *   This would normally mean allocating multiple bios under
+ *   submit_bio_noacct() would be susceptible to deadlocks, but we have
+ *   deadlock avoidance code that resubmits any blocked bios from a rescuer
+ *   thread.
  *
- * However, we do not guarantee forward progress for allocations from other
- * mempools. Doing multiple allocations from the same mempool under
- * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
- * for per bio allocations.
+ *   However, we do not guarantee forward progress for allocations from other
+ *   mempools. Doing multiple allocations from the same mempool under
+ *   submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
+ *   for per bio allocations.
  *
- * Returns: Pointer to new bio on success, NULL on failure.
+ *   RETURNS:
+ *   Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 			     struct bio_set *bs)
 {
 	gfp_t saved_gfp = gfp_mask;
+	unsigned front_pad;
+	unsigned inline_vecs;
+	struct bio_vec *bvl = NULL;
 	struct bio *bio;
 	void *p;
 
-	/* should not use nobvec bioset for nr_iovecs > 0 */
-	if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
-		return NULL;
+	if (!bs) {
+		if (nr_iovecs > UIO_MAXIOV)
+			return NULL;
 
-	/*
-	 * submit_bio_noacct() converts recursion to iteration; this means if
-	 * we're running beneath it, any bios we allocate and submit will not be
-	 * submitted (and thus freed) until after we return.
-	 *
-	 * This exposes us to a potential deadlock if we allocate multiple bios
-	 * from the same bio_set() while running underneath submit_bio_noacct().
-	 * If we were to allocate multiple bios (say a stacking block driver
-	 * that was splitting bios), we would deadlock if we exhausted the
-	 * mempool's reserve.
-	 *
-	 * We solve this, and guarantee forward progress, with a rescuer
-	 * workqueue per bio_set. If we go to allocate and there are bios on
-	 * current->bio_list, we first try the allocation without
-	 * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
-	 * blocking to the rescuer workqueue before we retry with the original
-	 * gfp_flags.
-	 */
-	if (current->bio_list &&
-	    (!bio_list_empty(&current->bio_list[0]) ||
-	     !bio_list_empty(&current->bio_list[1])) &&
-	    bs->rescue_workqueue)
-		gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+		p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
+		front_pad = 0;
+		inline_vecs = nr_iovecs;
+	} else {
+		/* should not use nobvec bioset for nr_iovecs > 0 */
+		if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
+				 nr_iovecs > 0))
+			return NULL;
+		/*
+		 * submit_bio_noacct() converts recursion to iteration; this
+		 * means if we're running beneath it, any bios we allocate and
+		 * submit will not be submitted (and thus freed) until after we
+		 * return.
+		 *
+		 * This exposes us to a potential deadlock if we allocate
+		 * multiple bios from the same bio_set() while running
+		 * underneath submit_bio_noacct(). If we were to allocate
+		 * multiple bios (say a stacking block driver that was splitting
+		 * bios), we would deadlock if we exhausted the mempool's
+		 * reserve.
+		 *
+		 * We solve this, and guarantee forward progress, with a rescuer
+		 * workqueue per bio_set. If we go to allocate and there are
+		 * bios on current->bio_list, we first try the allocation
+		 * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
+		 * bios we would be blocking to the rescuer workqueue before
+		 * we retry with the original gfp_flags.
+		 */
+
+		if (current->bio_list &&
+		    (!bio_list_empty(&current->bio_list[0]) ||
+		     !bio_list_empty(&current->bio_list[1])) &&
+		    bs->rescue_workqueue)
+			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
-	p = mempool_alloc(&bs->bio_pool, gfp_mask);
-	if (!p && gfp_mask != saved_gfp) {
-		punt_bios_to_rescuer(bs);
-		gfp_mask = saved_gfp;
 		p = mempool_alloc(&bs->bio_pool, gfp_mask);
+		if (!p && gfp_mask != saved_gfp) {
+			punt_bios_to_rescuer(bs);
+			gfp_mask = saved_gfp;
+			p = mempool_alloc(&bs->bio_pool, gfp_mask);
+		}
+
+		front_pad = bs->front_pad;
+		inline_vecs = BIO_INLINE_VECS;
 	}
+
 	if (unlikely(!p))
 		return NULL;
 
-	bio = p + bs->front_pad;
-	if (nr_iovecs > BIO_INLINE_VECS) {
-		struct bio_vec *bvl = NULL;
+	bio = p + front_pad;
+	bio_init(bio, NULL, 0);
 
-		bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+	if (nr_iovecs > inline_vecs) {
+		unsigned long idx = 0;
+
+		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
 		if (!bvl && gfp_mask != saved_gfp) {
 			punt_bios_to_rescuer(bs);
 			gfp_mask = saved_gfp;
-			bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
 		}
+
 		if (unlikely(!bvl))
 			goto err_free;
 
-		bio_init(bio, bvl, nr_iovecs);
+		bio->bi_flags |= idx << BVEC_POOL_OFFSET;
 	} else if (nr_iovecs) {
-		bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
-	} else {
-		bio_init(bio, NULL, 0);
+		bvl = bio->bi_inline_vecs;
 	}
 
 	bio->bi_pool = bs;
+	bio->bi_max_vecs = nr_iovecs;
+	bio->bi_io_vec = bvl;
 	return bio;
 
 err_free:
@@ -468,31 +529,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
-/**
- * bio_kmalloc - kmalloc a bio for I/O
- * @gfp_mask:   the GFP_* mask given to the slab allocator
- * @nr_iovecs:	number of iovecs to pre-allocate
- *
- * Use kmalloc to allocate and initialize a bio.
- *
- * Returns: Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
-{
-	struct bio *bio;
-
-	if (nr_iovecs > UIO_MAXIOV)
-		return NULL;
-
-	bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
-	if (unlikely(!bio))
-		return NULL;
-	bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
-	bio->bi_pool = NULL;
-	return bio;
-}
-EXPORT_SYMBOL(bio_kmalloc);
-
 void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
 {
 	unsigned long flags;
@@ -628,7 +664,7 @@ EXPORT_SYMBOL(bio_put);
  */
 void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 {
-	WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
+	BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
 
 	/*
 	 * most users will be overriding ->bi_bdev with a new target,
@@ -638,8 +674,6 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio_set_flag(bio, BIO_CLONED);
 	if (bio_flagged(bio_src, BIO_THROTTLED))
 		bio_set_flag(bio, BIO_THROTTLED);
-	if (bio_flagged(bio_src, BIO_REMAPPED))
-		bio_set_flag(bio, BIO_REMAPPED);
 	bio->bi_opf = bio_src->bi_opf;
 	bio->bi_ioprio = bio_src->bi_ioprio;
 	bio->bi_write_hint = bio_src->bi_write_hint;
@@ -949,18 +983,21 @@ void bio_release_pages(struct bio *bio, bool mark_dirty)
 }
 EXPORT_SYMBOL_GPL(bio_release_pages);
 
-static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
 {
-	WARN_ON_ONCE(bio->bi_max_vecs);
+	const struct bio_vec *bv = iter->bvec;
+	unsigned int len;
+	size_t size;
 
-	bio->bi_vcnt = iter->nr_segs;
-	bio->bi_io_vec = (struct bio_vec *)iter->bvec;
-	bio->bi_iter.bi_bvec_done = iter->iov_offset;
-	bio->bi_iter.bi_size = iter->count;
-	bio_set_flag(bio, BIO_NO_PAGE_REF);
-	bio_set_flag(bio, BIO_CLONED);
+	if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len))
+		return -EINVAL;
 
-	iov_iter_advance(iter, iter->count);
+	len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
+	size = bio_add_page(bio, bv->bv_page, len,
+				bv->bv_offset + iter->iov_offset);
+	if (unlikely(size != len))
+		return -EINVAL;
+	iov_iter_advance(iter, size);
 	return 0;
 }
 
@@ -1074,40 +1111,41 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
  * This takes either an iterator pointing to user memory, or one pointing to
  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  * map them into the kernel. On IO completion, the caller should put those
- * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
- * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
- * to ensure the bvecs and pages stay referenced until the submitted I/O is
- * completed by a call to ->ki_complete() or returns with an error other than
- * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
- * on IO completion. If it isn't, then pages should be released.
+ * pages. If we're adding kernel pages, and the caller told us it's safe to
+ * do so, we just have to add the pages to the bio directly. We don't grab an
+ * extra reference to those pages (the user should already have that), and we
+ * don't put the page on IO completion. The caller needs to check if the bio is
+ * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
+ * released.
  *
  * The function tries, but does not guarantee, to pin as many pages as
  * fit into the bio, or are requested in @iter, whatever is smaller. If
  * MM encounters an error pinning the requested pages, it stops. Error
  * is returned only if 0 pages could be pinned.
- *
- * It's intended for direct IO, so doesn't do PSI tracking, the caller is
- * responsible for setting BIO_WORKINGSET if necessary.
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
-	int ret = 0;
+	const bool is_bvec = iov_iter_is_bvec(iter);
+	int ret;
 
-	if (iov_iter_is_bvec(iter)) {
-		if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
-			return -EINVAL;
-		return bio_iov_bvec_set(bio, iter);
-	}
+	if (WARN_ON_ONCE(bio->bi_vcnt))
+		return -EINVAL;
 
 	do {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+			if (WARN_ON_ONCE(is_bvec))
+				return -EINVAL;
 			ret = __bio_iov_append_get_pages(bio, iter);
-		else
-			ret = __bio_iov_iter_get_pages(bio, iter);
+		} else {
+			if (is_bvec)
+				ret = __bio_iov_bvec_add_pages(bio, iter);
+			else
+				ret = __bio_iov_iter_get_pages(bio, iter);
+		}
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
-	/* don't account direct I/O as memory stall */
-	bio_clear_flag(bio, BIO_WORKINGSET);
+	if (is_bvec)
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
 	return bio->bi_vcnt ? 0 : ret;
 }
 EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
@@ -1512,7 +1550,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
  */
 int biovec_init_pool(mempool_t *pool, int pool_entries)
 {
-	struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
+	struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
 
 	return mempool_init_slab_pool(pool, pool_entries, bp->slab);
 }
@@ -1565,17 +1603,15 @@ int bioset_init(struct bio_set *bs,
 		unsigned int front_pad,
 		int flags)
 {
+	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+
 	bs->front_pad = front_pad;
-	if (flags & BIOSET_NEED_BVECS)
-		bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
-	else
-		bs->back_pad = 0;
 
 	spin_lock_init(&bs->rescue_lock);
 	bio_list_init(&bs->rescue_list);
 	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
 
-	bs->bio_slab = bio_find_or_create_slab(bs);
+	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
 	if (!bs->bio_slab)
 		return -ENOMEM;
 
@@ -1618,19 +1654,39 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
 }
 EXPORT_SYMBOL(bioset_init_from_src);
 
-static int __init init_bio(void)
+static void __init biovec_init_slabs(void)
 {
 	int i;
 
-	bio_integrity_init();
-
-	for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
+	for (i = 0; i < BVEC_POOL_NR; i++) {
+		int size;
 		struct biovec_slab *bvs = bvec_slabs + i;
 
-		bvs->slab = kmem_cache_create(bvs->name,
-				bvs->nr_vecs * sizeof(struct bio_vec), 0,
-				SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
+			bvs->slab = NULL;
+			continue;
+		}
+
+		size = bvs->nr_vecs * sizeof(struct bio_vec);
+		bvs->slab = kmem_cache_create(bvs->name, size, 0,
+                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	}
+}
+
+static int __init init_bio(void)
+{
+	bio_slab_max = 2;
+	bio_slab_nr = 0;
+	bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab),
+			    GFP_KERNEL);
+
+	BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET);
+
+	if (!bio_slabs)
+		panic("bio: can't allocate bios\n");
+
+	bio_integrity_init();
+	biovec_init_slabs();
 
 	if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
 		panic("bio: can't allocate bios\n");
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a317c03d40f6..34d812d7cd3a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,6 +32,8 @@
 #include <linux/psi.h>
 #include "blk.h"
 
+#define MAX_KEY_LEN 100
+
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
  * blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -1763,15 +1765,12 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
 	if (unlikely(current->flags & PF_KTHREAD))
 		return;
 
-	if (current->throttle_queue != q) {
-		if (!blk_get_queue(q))
-			return;
-
-		if (current->throttle_queue)
-			blk_put_queue(current->throttle_queue);
-		current->throttle_queue = q;
-	}
+	if (!blk_get_queue(q))
+		return;
 
+	if (current->throttle_queue)
+		blk_put_queue(current->throttle_queue);
+	current->throttle_queue = q;
 	if (use_memdelay)
 		current->use_memdelay = use_memdelay;
 	set_notify_resume(current);
diff --git a/block/blk-core.c b/block/blk-core.c
index 5e752840b41a..64f69022de96 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -531,7 +531,7 @@ struct request_queue *blk_alloc_queue(int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
-	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
+	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 	if (ret)
 		goto fail_id;
 
@@ -752,7 +752,7 @@ static int blk_partition_remap(struct bio *bio)
 				      bio->bi_iter.bi_sector -
 				      p->bd_start_sect);
 	}
-	bio_set_flag(bio, BIO_REMAPPED);
+	bio->bi_bdev = bdev_whole(p);
 	return 0;
 }
 
@@ -815,12 +815,10 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 		goto end_io;
 	if (unlikely(bio_check_ro(bio)))
 		goto end_io;
-	if (!bio_flagged(bio, BIO_REMAPPED)) {
-		if (unlikely(bio_check_eod(bio)))
-			goto end_io;
-		if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
-			goto end_io;
-	}
+	if (unlikely(bio_check_eod(bio)))
+		goto end_io;
+	if (bio->bi_bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+		goto end_io;
 
 	/*
 	 * Filter flush bio's early so that bio based drivers without flush
@@ -1299,11 +1297,7 @@ void blk_account_io_start(struct request *rq)
 	if (!blk_do_io_stat(rq))
 		return;
 
-	/* passthrough requests can hold bios that do not have ->bi_bdev set */
-	if (rq->bio && rq->bio->bi_bdev)
-		rq->part = rq->bio->bi_bdev;
-	else
-		rq->part = rq->rq_disk->part0;
+	rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
 	part_stat_lock();
 	update_io_ticks(rq->part, jiffies, false);
@@ -1326,17 +1320,14 @@ static unsigned long __part_start_io_acct(struct block_device *part,
 	return now;
 }
 
-/**
- * bio_start_io_acct - start I/O accounting for bio based drivers
- * @bio:	bio to start account for
- *
- * Returns the start time that should be passed back to bio_end_io_acct().
- */
-unsigned long bio_start_io_acct(struct bio *bio)
+unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part,
+				 struct bio *bio)
 {
-	return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio));
+	*part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
+
+	return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
 }
-EXPORT_SYMBOL_GPL(bio_start_io_acct);
+EXPORT_SYMBOL_GPL(part_start_io_acct);
 
 unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
 				 unsigned int op)
@@ -1359,12 +1350,12 @@ static void __part_end_io_acct(struct block_device *part, unsigned int op,
 	part_stat_unlock();
 }
 
-void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
-		struct block_device *orig_bdev)
+void part_end_io_acct(struct block_device *part, struct bio *bio,
+		      unsigned long start_time)
 {
-	__part_end_io_acct(orig_bdev, bio_op(bio), start_time);
+	__part_end_io_acct(part, bio_op(bio), start_time);
 }
-EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
+EXPORT_SYMBOL_GPL(part_end_io_acct);
 
 void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 		      unsigned long start_time)
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 07bed8ae2bcd..9f9c4ddc76a5 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -164,12 +164,10 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
 	struct bio_vec bv;
 	struct bio *bio;
 
-	bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src));
+	bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL);
 	if (!bio)
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
-	if (bio_flagged(bio_src, BIO_REMAPPED))
-		bio_set_flag(bio, BIO_REMAPPED);
 	bio->bi_opf		= bio_src->bi_opf;
 	bio->bi_ioprio		= bio_src->bi_ioprio;
 	bio->bi_write_hint	= bio_src->bi_write_hint;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index beae70a0e5e5..85324d53d072 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -31,7 +31,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
 }
 
 /**
- * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
+ * blk_execute_rq_nowait - insert a request into queue for execution
+ * @q:		queue to insert the request in
  * @bd_disk:	matching gendisk
  * @rq:		request to insert
  * @at_head:    insert request at head or tail of queue
@@ -44,8 +45,9 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
  * Note:
  *    This function will invoke @done directly if the queue is dead.
  */
-void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq,
-			   int at_head, rq_end_io_fn *done)
+void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
+			   struct request *rq, int at_head,
+			   rq_end_io_fn *done)
 {
 	WARN_ON(irqs_disabled());
 	WARN_ON(!blk_rq_is_passthrough(rq));
@@ -65,6 +67,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
 
 /**
  * blk_execute_rq - insert a request into queue for execution
+ * @q:		queue to insert the request in
  * @bd_disk:	matching gendisk
  * @rq:		request to insert
  * @at_head:    insert request at head or tail of queue
@@ -73,13 +76,14 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
  *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution and wait for completion.
  */
-void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head)
+void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
+		   struct request *rq, int at_head)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	unsigned long hang_check;
 
 	rq->end_io_data = &wait;
-	blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq);
+	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
 
 	/* Prevent hang_check timer from firing at us during very long I/O */
 	hang_check = sysctl_hung_task_timeout_secs;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 7942ca6ed321..76c1624cb06c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -432,18 +432,23 @@ void blk_insert_flush(struct request *rq)
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
  *
  * Description:
  *    Issue a flush for the block device in question.
  */
-int blkdev_issue_flush(struct block_device *bdev)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
 {
-	struct bio bio;
+	struct bio *bio;
+	int ret = 0;
 
-	bio_init(&bio, NULL, 0);
-	bio_set_dev(&bio, bdev);
-	bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-	return submit_bio_wait(&bio);
+	bio = bio_alloc(gfp_mask, 0);
+	bio_set_dev(bio, bdev);
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+	ret = submit_bio_wait(bio);
+	bio_put(bio);
+	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f21d922ecfaf..74b17b396f4c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1646,42 +1646,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
-/*
- * Is the request queue handled by an IO scheduler that does not respect
- * hardware queues when dispatching?
- */
-static bool blk_mq_has_sqsched(struct request_queue *q)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e && e->type->ops.dispatch_request &&
-	    !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
-		return true;
-	return false;
-}
-
-/*
- * Return prefered queue to dispatch from (if any) for non-mq aware IO
- * scheduler.
- */
-static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
-{
-	struct blk_mq_hw_ctx *hctx;
-
-	/*
-	 * If the IO scheduler does not respect hardware queues when
-	 * dispatching, we just don't bother with multiple HW queues and
-	 * dispatch from hctx for the current CPU since running multiple queues
-	 * just causes lock contention inside the scheduler and pointless cache
-	 * bouncing.
-	 */
-	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
-				     raw_smp_processor_id());
-	if (!blk_mq_hctx_stopped(hctx))
-		return hctx;
-	return NULL;
-}
-
 /**
  * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
  * @q: Pointer to the request queue to run.
@@ -1689,23 +1653,14 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
  */
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
-	struct blk_mq_hw_ctx *hctx, *sq_hctx;
+	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	sq_hctx = NULL;
-	if (blk_mq_has_sqsched(q))
-		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
-		/*
-		 * Dispatch from this hctx either if there's no hctx preferred
-		 * by IO scheduler or if it has requests that bypass the
-		 * scheduler.
-		 */
-		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
-			blk_mq_run_hw_queue(hctx, async);
+
+		blk_mq_run_hw_queue(hctx, async);
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
@@ -1717,23 +1672,14 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
  */
 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 {
-	struct blk_mq_hw_ctx *hctx, *sq_hctx;
+	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	sq_hctx = NULL;
-	if (blk_mq_has_sqsched(q))
-		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
-		/*
-		 * Dispatch from this hctx either if there's no hctx preferred
-		 * by IO scheduler or if it has requests that bypass the
-		 * scheduler.
-		 */
-		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
-			blk_mq_delay_run_hw_queue(hctx, msecs);
+
+		blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
@@ -2707,6 +2653,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 		goto free_hctx;
 
 	atomic_set(&hctx->nr_active, 0);
+	atomic_set(&hctx->elevator_queued, 0);
 	if (node == NUMA_NO_NODE)
 		node = set->numa_node;
 	hctx->numa_node = node;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7dd8be314ac6..43990b1d148b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,7 +60,6 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
-	lim->zone_write_granularity = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -367,28 +366,6 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
 }
 EXPORT_SYMBOL(blk_queue_physical_block_size);
 
-/**
- * blk_queue_zone_write_granularity - set zone write granularity for the queue
- * @q:  the request queue for the zoned device
- * @size:  the zone write granularity size, in bytes
- *
- * Description:
- *   This should be set to the lowest possible size allowing to write in
- *   sequential zones of a zoned block device.
- */
-void blk_queue_zone_write_granularity(struct request_queue *q,
-				      unsigned int size)
-{
-	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
-		return;
-
-	q->limits.zone_write_granularity = size;
-
-	if (q->limits.zone_write_granularity < q->limits.logical_block_size)
-		q->limits.zone_write_granularity = q->limits.logical_block_size;
-}
-EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
-
 /**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q:	the request queue for the device
@@ -654,8 +631,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			t->discard_granularity;
 	}
 
-	t->zone_write_granularity = max(t->zone_write_granularity,
-					b->zone_write_granularity);
 	t->zoned = max(t->zoned, b->zoned);
 	return ret;
 }
@@ -872,8 +847,6 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
  */
 void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 {
-	struct request_queue *q = disk->queue;
-
 	switch (model) {
 	case BLK_ZONED_HM:
 		/*
@@ -892,7 +865,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 		 * we do nothing special as far as the block layer is concerned.
 		 */
 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
-		    !xa_empty(&disk->part_tbl))
+		    disk_has_partitions(disk))
 			model = BLK_ZONED_NONE;
 		break;
 	case BLK_ZONED_NONE:
@@ -902,17 +875,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 		break;
 	}
 
-	q->limits.zoned = model;
-	if (model != BLK_ZONED_NONE) {
-		/*
-		 * Set the zone write granularity to the device logical block
-		 * size by default. The driver can change this value if needed.
-		 */
-		blk_queue_zone_write_granularity(q,
-						queue_logical_block_size(q));
-	} else {
-		blk_queue_clear_zone_settings(q);
-	}
+	disk->queue->limits.zoned = model;
 }
 EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index ae39c7f3d83d..b513f1683af0 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -219,12 +219,6 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
 		(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
 }
 
-static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
-						 char *page)
-{
-	return queue_var_show(queue_zone_write_granularity(q), page);
-}
-
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
 	unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -591,7 +585,6 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -646,7 +639,6 @@ static struct attribute *queue_attrs[] = {
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
-	&queue_zone_write_granularity_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
 	&queue_nr_zones_entry.attr,
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 42aed0160f86..0321ca83e73f 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -518,7 +518,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
 	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
 }
 
-static inline bool wbt_should_throttle(struct bio *bio)
+static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
 {
 	switch (bio_op(bio)) {
 	case REQ_OP_WRITE:
@@ -545,7 +545,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
 
 	if (bio_op(bio) == REQ_OP_READ) {
 		flags = WBT_READ;
-	} else if (wbt_should_throttle(bio)) {
+	} else if (wbt_should_throttle(rwb, bio)) {
 		if (current_is_kswapd())
 			flags |= WBT_KSWAPD;
 		if (bio_op(bio) == REQ_OP_DISCARD)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 833978c02e60..7a68b6e4300c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -549,20 +549,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-
-void blk_queue_clear_zone_settings(struct request_queue *q)
-{
-	blk_mq_freeze_queue(q);
-
-	blk_queue_free_zone_bitmaps(q);
-	blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
-	q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
-	q->nr_zones = 0;
-	q->max_open_zones = 0;
-	q->max_active_zones = 0;
-	q->limits.chunk_sectors = 0;
-	q->limits.zone_write_granularity = 0;
-	q->limits.max_zone_append_sectors = 0;
-
-	blk_mq_unfreeze_queue(q);
-}
diff --git a/block/blk.h b/block/blk.h
index 3b53e44b967e..10ab7c0d0766 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -55,11 +55,6 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 
 void blk_freeze_queue(struct request_queue *q);
 
-#define BIO_INLINE_VECS 4
-struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
-		gfp_t gfp_mask);
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
-
 static inline bool biovec_phys_mergeable(struct request_queue *q,
 		struct bio_vec *vec1, struct bio_vec *vec2)
 {
@@ -334,12 +329,12 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
 
 #ifdef CONFIG_BLK_DEV_ZONED
 void blk_queue_free_zone_bitmaps(struct request_queue *q);
-void blk_queue_clear_zone_settings(struct request_queue *q);
 #else
 static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
-static inline void blk_queue_clear_zone_settings(struct request_queue *q) {}
 #endif
 
+struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector);
+
 int blk_alloc_devt(struct block_device *part, dev_t *devt);
 void blk_free_devt(dev_t devt);
 char *disk_name(struct gendisk *hd, int partno, char *buf);
@@ -352,6 +347,7 @@ int bdev_add_partition(struct block_device *bdev, int partno,
 int bdev_del_partition(struct block_device *bdev, int partno);
 int bdev_resize_partition(struct block_device *bdev, int partno,
 		sector_t start, sector_t length);
+int disk_expand_part_tbl(struct gendisk *disk, int target);
 
 int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
diff --git a/block/bounce.c b/block/bounce.c
index fc55314aa426..a22a8a1942b2 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -247,8 +247,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 	if (!bio)
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
-	if (bio_flagged(bio_src, BIO_REMAPPED))
-		bio_set_flag(bio, BIO_REMAPPED);
 	bio->bi_opf		= bio_src->bi_opf;
 	bio->bi_ioprio		= bio_src->bi_ioprio;
 	bio->bi_write_hint	= bio_src->bi_write_hint;
diff --git a/block/bsg.c b/block/bsg.c
index bd10922d5cbb..d7bae94b64d9 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -157,10 +157,8 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 		return PTR_ERR(rq);
 
 	ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode);
-	if (ret) {
-		blk_put_request(rq);
+	if (ret)
 		return ret;
-	}
 
 	rq->timeout = msecs_to_jiffies(hdr.timeout);
 	if (!rq->timeout)
@@ -183,7 +181,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 
 	bio = rq->bio;
 
-	blk_execute_rq(NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
+	blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
 	ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr);
 	blk_rq_unmap_user(bio);
 
diff --git a/block/genhd.c b/block/genhd.c
index 36ff45bbaaaf..55f85c63d687 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -162,6 +162,15 @@ static void part_in_flight_rw(struct block_device *part,
 		inflight[1] = 0;
 }
 
+static struct block_device *__disk_get_part(struct gendisk *disk, int partno)
+{
+	struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
+
+	if (unlikely(partno < 0 || partno >= ptbl->len))
+		return NULL;
+	return rcu_dereference(ptbl->part[partno]);
+}
+
 /**
  * disk_part_iter_init - initialize partition iterator
  * @piter: iterator to initialize
@@ -176,14 +185,26 @@ static void part_in_flight_rw(struct block_device *part,
 void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
 			  unsigned int flags)
 {
+	struct disk_part_tbl *ptbl;
+
+	rcu_read_lock();
+	ptbl = rcu_dereference(disk->part_tbl);
+
 	piter->disk = disk;
 	piter->part = NULL;
-	if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
+
+	if (flags & DISK_PITER_REVERSE)
+		piter->idx = ptbl->len - 1;
+	else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
 		piter->idx = 0;
 	else
 		piter->idx = 1;
+
 	piter->flags = flags;
+
+	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(disk_part_iter_init);
 
 /**
  * disk_part_iter_next - proceed iterator to the next partition and return it
@@ -196,30 +217,57 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
  */
 struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
 {
-	struct block_device *part;
-	unsigned long idx;
+	struct disk_part_tbl *ptbl;
+	int inc, end;
 
 	/* put the last partition */
 	disk_part_iter_exit(piter);
 
+	/* get part_tbl */
 	rcu_read_lock();
-	xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) {
-		if (!bdev_nr_sectors(part) &&
-		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
-		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
-		      piter->idx == 0))
-			continue;
+	ptbl = rcu_dereference(piter->disk->part_tbl);
 
+	/* determine iteration parameters */
+	if (piter->flags & DISK_PITER_REVERSE) {
+		inc = -1;
+		if (piter->flags & (DISK_PITER_INCL_PART0 |
+				    DISK_PITER_INCL_EMPTY_PART0))
+			end = -1;
+		else
+			end = 0;
+	} else {
+		inc = 1;
+		end = ptbl->len;
+	}
+
+	/* iterate to the next partition */
+	for (; piter->idx != end; piter->idx += inc) {
+		struct block_device *part;
+
+		part = rcu_dereference(ptbl->part[piter->idx]);
+		if (!part)
+			continue;
 		piter->part = bdgrab(part);
 		if (!piter->part)
 			continue;
-		piter->idx = idx + 1;
+		if (!bdev_nr_sectors(part) &&
+		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
+		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
+		      piter->idx == 0)) {
+			bdput(piter->part);
+			piter->part = NULL;
+			continue;
+		}
+
+		piter->idx += inc;
 		break;
 	}
+
 	rcu_read_unlock();
 
 	return piter->part;
 }
+EXPORT_SYMBOL_GPL(disk_part_iter_next);
 
 /**
  * disk_part_iter_exit - finish up partition iteration
@@ -236,6 +284,91 @@ void disk_part_iter_exit(struct disk_part_iter *piter)
 		bdput(piter->part);
 	piter->part = NULL;
 }
+EXPORT_SYMBOL_GPL(disk_part_iter_exit);
+
+static inline int sector_in_part(struct block_device *part, sector_t sector)
+{
+	return part->bd_start_sect <= sector &&
+		sector < part->bd_start_sect + bdev_nr_sectors(part);
+}
+
+/**
+ * disk_map_sector_rcu - map sector to partition
+ * @disk: gendisk of interest
+ * @sector: sector to map
+ *
+ * Find out which partition @sector maps to on @disk.  This is
+ * primarily used for stats accounting.
+ *
+ * CONTEXT:
+ * RCU read locked.
+ *
+ * RETURNS:
+ * Found partition on success, part0 is returned if no partition matches
+ * or the matched partition is being deleted.
+ */
+struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
+{
+	struct disk_part_tbl *ptbl;
+	struct block_device *part;
+	int i;
+
+	rcu_read_lock();
+	ptbl = rcu_dereference(disk->part_tbl);
+
+	part = rcu_dereference(ptbl->last_lookup);
+	if (part && sector_in_part(part, sector))
+		goto out_unlock;
+
+	for (i = 1; i < ptbl->len; i++) {
+		part = rcu_dereference(ptbl->part[i]);
+		if (part && sector_in_part(part, sector)) {
+			rcu_assign_pointer(ptbl->last_lookup, part);
+			goto out_unlock;
+		}
+	}
+
+	part = disk->part0;
+out_unlock:
+	rcu_read_unlock();
+	return part;
+}
+
+/**
+ * disk_has_partitions
+ * @disk: gendisk of interest
+ *
+ * Walk through the partition table and check if valid partition exists.
+ *
+ * CONTEXT:
+ * Don't care.
+ *
+ * RETURNS:
+ * True if the gendisk has at least one valid non-zero size partition.
+ * Otherwise false.
+ */
+bool disk_has_partitions(struct gendisk *disk)
+{
+	struct disk_part_tbl *ptbl;
+	int i;
+	bool ret = false;
+
+	rcu_read_lock();
+	ptbl = rcu_dereference(disk->part_tbl);
+
+	/* Iterate partitions skipping the whole device at index 0 */
+	for (i = 1; i < ptbl->len; i++) {
+		if (rcu_dereference(ptbl->part[i])) {
+			ret = true;
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(disk_has_partitions);
 
 /*
  * Can be deleted altogether. Later.
@@ -471,18 +604,6 @@ static char *bdevt_str(dev_t devt, char *buf)
 	return buf;
 }
 
-void disk_uevent(struct gendisk *disk, enum kobject_action action)
-{
-	struct disk_part_iter piter;
-	struct block_device *part;
-
-	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
-	while ((part = disk_part_iter_next(&piter)))
-		kobject_uevent(bdev_kobj(part), action);
-	disk_part_iter_exit(&piter);
-}
-EXPORT_SYMBOL_GPL(disk_uevent);
-
 static void disk_scan_partitions(struct gendisk *disk)
 {
 	struct block_device *bdev;
@@ -500,6 +621,8 @@ static void register_disk(struct device *parent, struct gendisk *disk,
 			  const struct attribute_group **groups)
 {
 	struct device *ddev = disk_to_dev(disk);
+	struct disk_part_iter piter;
+	struct block_device *part;
 	int err;
 
 	ddev->parent = parent;
@@ -542,9 +665,15 @@ static void register_disk(struct device *parent, struct gendisk *disk,
 
 	disk_scan_partitions(disk);
 
-	/* announce the disk and partitions after all partitions are created */
+	/* announce disk after possible partitions are created */
 	dev_set_uevent_suppress(ddev, 0);
-	disk_uevent(disk, KOBJ_ADD);
+	kobject_uevent(&ddev->kobj, KOBJ_ADD);
+
+	/* announce possible partitions */
+	disk_part_iter_init(&piter, disk, 0);
+	while ((part = disk_part_iter_next(&piter)))
+		kobject_uevent(bdev_kobj(part), KOBJ_ADD);
+	disk_part_iter_exit(&piter);
 
 	if (disk->queue->backing_dev_info->dev) {
 		err = sysfs_create_link(&ddev->kobj,
@@ -700,7 +829,8 @@ void del_gendisk(struct gendisk *disk)
 	down_write(&bdev_lookup_sem);
 
 	/* invalidate stuff */
-	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+	disk_part_iter_init(&piter, disk,
+			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
 	while ((part = disk_part_iter_next(&piter))) {
 		invalidate_partition(part);
 		delete_partition(part);
@@ -799,7 +929,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
 	struct block_device *bdev = NULL;
 
 	rcu_read_lock();
-	bdev = xa_load(&disk->part_tbl, partno);
+	bdev = __disk_get_part(disk, partno);
 	if (bdev && !bdgrab(bdev))
 		bdev = NULL;
 	rcu_read_unlock();
@@ -1189,6 +1319,83 @@ static const struct attribute_group *disk_attr_groups[] = {
 	NULL
 };
 
+/**
+ * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
+ * @disk: disk to replace part_tbl for
+ * @new_ptbl: new part_tbl to install
+ *
+ * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
+ * original ptbl is freed using RCU callback.
+ *
+ * LOCKING:
+ * Matching bd_mutex locked or the caller is the only user of @disk.
+ */
+static void disk_replace_part_tbl(struct gendisk *disk,
+				  struct disk_part_tbl *new_ptbl)
+{
+	struct disk_part_tbl *old_ptbl =
+		rcu_dereference_protected(disk->part_tbl, 1);
+
+	rcu_assign_pointer(disk->part_tbl, new_ptbl);
+
+	if (old_ptbl) {
+		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
+		kfree_rcu(old_ptbl, rcu_head);
+	}
+}
+
+/**
+ * disk_expand_part_tbl - expand disk->part_tbl
+ * @disk: disk to expand part_tbl for
+ * @partno: expand such that this partno can fit in
+ *
+ * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
+ * uses RCU to allow unlocked dereferencing for stats and other stuff.
+ *
+ * LOCKING:
+ * Matching bd_mutex locked or the caller is the only user of @disk.
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int disk_expand_part_tbl(struct gendisk *disk, int partno)
+{
+	struct disk_part_tbl *old_ptbl =
+		rcu_dereference_protected(disk->part_tbl, 1);
+	struct disk_part_tbl *new_ptbl;
+	int len = old_ptbl ? old_ptbl->len : 0;
+	int i, target;
+
+	/*
+	 * check for int overflow, since we can get here from blkpg_ioctl()
+	 * with a user passed 'partno'.
+	 */
+	target = partno + 1;
+	if (target < 0)
+		return -EINVAL;
+
+	/* disk_max_parts() is zero during initialization, ignore if so */
+	if (disk_max_parts(disk) && target > disk_max_parts(disk))
+		return -EINVAL;
+
+	if (target <= len)
+		return 0;
+
+	new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
+				disk->node_id);
+	if (!new_ptbl)
+		return -ENOMEM;
+
+	new_ptbl->len = target;
+
+	for (i = 0; i < len; i++)
+		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
+
+	disk_replace_part_tbl(disk, new_ptbl);
+	return 0;
+}
+
 /**
  * disk_release - releases all allocated resources of the gendisk
  * @dev: the device representing this disk
@@ -1212,7 +1419,7 @@ static void disk_release(struct device *dev)
 	blk_free_devt(dev->devt);
 	disk_release_events(disk);
 	kfree(disk->random);
-	xa_destroy(&disk->part_tbl);
+	disk_replace_part_tbl(disk, NULL);
 	bdput(disk->part0);
 	if (disk->queue)
 		blk_put_queue(disk->queue);
@@ -1365,6 +1572,7 @@ dev_t blk_lookup_devt(const char *name, int partno)
 struct gendisk *__alloc_disk_node(int minors, int node_id)
 {
 	struct gendisk *disk;
+	struct disk_part_tbl *ptbl;
 
 	if (minors > DISK_MAX_PARTS) {
 		printk(KERN_ERR
@@ -1382,9 +1590,11 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
 		goto out_free_disk;
 
 	disk->node_id = node_id;
-	xa_init(&disk->part_tbl);
-	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
-		goto out_destroy_part_tbl;
+	if (disk_expand_part_tbl(disk, 0))
+		goto out_bdput;
+
+	ptbl = rcu_dereference_protected(disk->part_tbl, 1);
+	rcu_assign_pointer(ptbl->part[0], disk->part0);
 
 	disk->minors = minors;
 	rand_initialize_disk(disk);
@@ -1393,8 +1603,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
 	device_initialize(disk_to_dev(disk));
 	return disk;
 
-out_destroy_part_tbl:
-	xa_destroy(&disk->part_tbl);
+out_bdput:
 	bdput(disk->part0);
 out_free_disk:
 	kfree(disk);
@@ -1432,7 +1641,7 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 /**
  * set_disk_ro - set a gendisk read-only
  * @disk:	gendisk to operate on
- * @read_only:	%true to set the disk read-only, %false set the disk read/write
+ * @ready_only:	%true to set the disk read-only, %false set the disk read/write
  *
  * This function is used to indicate whether a given disk device should have its
  * read-only flag set. set_disk_ro() is typically used by device drivers to
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index c25c41d0d061..dc89199bc8c6 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -1029,7 +1029,6 @@ static struct elevator_type kyber_sched = {
 #endif
 	.elevator_attrs = kyber_sched_attrs,
 	.elevator_name = "kyber",
-	.elevator_features = ELEVATOR_F_MQ_AWARE,
 	.elevator_owner = THIS_MODULE,
 };
 
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b57470e154c8..800ac902809b 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -386,6 +386,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	spin_lock(&dd->lock);
 	rq = __dd_dispatch_request(dd);
 	spin_unlock(&dd->lock);
+	if (rq)
+		atomic_dec(&rq->mq_hctx->elevator_queued);
 
 	return rq;
 }
@@ -533,6 +535,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		dd_insert_request(hctx, rq, at_head);
+		atomic_inc(&hctx->elevator_queued);
 	}
 	spin_unlock(&dd->lock);
 }
@@ -579,6 +582,9 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 
+	if (!atomic_read(&hctx->elevator_queued))
+		return false;
+
 	return !list_empty_careful(&dd->dispatch) ||
 		!list_empty_careful(&dd->fifo_list[0]) ||
 		!list_empty_careful(&dd->fifo_list[1]);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index f3d9ff2cafb6..5e916fa1d6dd 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -289,7 +289,13 @@ struct device_type part_type = {
  */
 void delete_partition(struct block_device *part)
 {
-	xa_erase(&part->bd_disk->part_tbl, part->bd_partno);
+	struct gendisk *disk = part->bd_disk;
+	struct disk_part_tbl *ptbl =
+		rcu_dereference_protected(disk->part_tbl, 1);
+
+	rcu_assign_pointer(ptbl->part[part->bd_partno], NULL);
+	rcu_assign_pointer(ptbl->last_lookup, NULL);
+
 	kobject_put(part->bd_holder_dir);
 	device_del(&part->bd_device);
 
@@ -321,6 +327,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	struct device *ddev = disk_to_dev(disk);
 	struct device *pdev;
 	struct block_device *bdev;
+	struct disk_part_tbl *ptbl;
 	const char *dname;
 	int err;
 
@@ -336,13 +343,18 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	case BLK_ZONED_HA:
 		pr_info("%s: disabling host aware zoned block device support due to partitions\n",
 			disk->disk_name);
-		blk_queue_set_zoned(disk, BLK_ZONED_NONE);
+		disk->queue->limits.zoned = BLK_ZONED_NONE;
 		break;
 	case BLK_ZONED_NONE:
 		break;
 	}
 
-	if (xa_load(&disk->part_tbl, partno))
+	err = disk_expand_part_tbl(disk, partno);
+	if (err)
+		return ERR_PTR(err);
+	ptbl = rcu_dereference_protected(disk->part_tbl, 1);
+
+	if (ptbl->part[partno])
 		return ERR_PTR(-EBUSY);
 
 	bdev = bdev_alloc(disk, partno);
@@ -395,10 +407,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	}
 
 	/* everything is up and running, commence */
-	err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
-	if (err)
-		goto out_del;
 	bdev_add(bdev, devt);
+	rcu_assign_pointer(ptbl->part[partno], bdev);
 
 	/* suppress uevent if the disk suppresses it */
 	if (!dev_get_uevent_suppress(ddev))
@@ -604,7 +614,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
 int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 {
 	struct parsed_partitions *state;
-	int ret = -EAGAIN, p;
+	int ret = -EAGAIN, p, highest;
 
 	if (!disk_part_scan_enabled(disk))
 		return 0;
@@ -652,6 +662,15 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 
+	/*
+	 * Detect the highest partition number and preallocate disk->part_tbl.
+	 * This is an optimization and not strictly necessary.
+	 */
+	for (p = 1, highest = 0; p < state->limit; p++)
+		if (state->parts[p].size)
+			highest = p;
+	disk_expand_part_tbl(disk, highest);
+
 	for (p = 1; p < state->limit; p++)
 		if (!blk_add_partition(disk, bdev, state, p))
 			goto out_free_state;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 6599bac0a78c..c9f009cc0446 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -357,7 +357,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	 * (if he doesn't check that is his problem).
 	 * N.B. a non-zero SCSI status is _not_ necessarily an error.
 	 */
-	blk_execute_rq(bd_disk, rq, at_head);
+	blk_execute_rq(q, bd_disk, rq, at_head);
 
 	hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
@@ -493,7 +493,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		goto error;
 	}
 
-	blk_execute_rq(disk, rq, 0);
+	blk_execute_rq(q, disk, rq, 0);
 
 	err = req->result & 0xff;	/* only 8 bit SCSI status */
 	if (err) {
@@ -532,7 +532,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	scsi_req(rq)->cmd[0] = cmd;
 	scsi_req(rq)->cmd[4] = data;
 	scsi_req(rq)->cmd_len = 6;
-	blk_execute_rq(bd_disk, rq, 0);
+	blk_execute_rq(q, bd_disk, rq, 0);
 	err = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 72cf7603d51f..7227fc7ab8ed 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -138,7 +138,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 		op_flags |= REQ_FUA | REQ_PREFLUSH;
 	op_flags |= REQ_SYNC;
 
-	bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set);
+	bio = bio_alloc_drbd(GFP_NOIO);
 	bio_set_dev(bio, bdev->md_bdev);
 	bio->bi_iter.bi_sector = sector;
 	err = -EIO;
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index c1f816f896a8..df53dca5d02c 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -976,7 +976,7 @@ static void drbd_bm_endio(struct bio *bio)
 
 static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 {
-	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set);
+	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
 	struct drbd_device *device = ctx->device;
 	struct drbd_bitmap *b = device->bitmap;
 	struct page *page;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 02db50d7e4c6..b2c93a29c251 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1422,6 +1422,8 @@ extern mempool_t drbd_md_io_page_pool;
 /* We also need to make sure we get a bio
  * when we need it for housekeeping purposes */
 extern struct bio_set drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 
 /* And a bio_set for cloning */
 extern struct bio_set drbd_io_bio_set;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 788dd97e6026..1c8c18b2a25f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -138,6 +138,19 @@ static const struct block_device_operations drbd_ops = {
 	.release	= drbd_release,
 };
 
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	if (!bioset_initialized(&drbd_md_io_bio_set))
+		return bio_alloc(gfp_mask, 1);
+
+	bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set);
+	if (!bio)
+		return NULL;
+	return bio;
+}
+
 #ifdef __CHECKER__
 /* When checking with sparse, and this is an inline function, sparse will
    give tons of false positives. When this is a real functions sparse works.
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 9dbb660a7d7c..ea0f31ab3343 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -30,10 +30,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
 		return NULL;
 	memset(req, 0, sizeof(*req));
 
-	req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
-	req->private_bio->bi_private = req;
-	req->private_bio->bi_end_io = drbd_request_endio;
-
+	drbd_req_make_private_bio(req, bio_src);
 	req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
 		      | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
 		      | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0)
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 511f39a08de4..55bb0f8721fa 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -256,6 +256,18 @@ enum drbd_req_state_bits {
 #define MR_WRITE       1
 #define MR_READ        2
 
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+	struct bio *bio;
+	bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
+
+	req->private_bio = bio;
+
+	bio->bi_private  = req;
+	bio->bi_end_io   = drbd_request_endio;
+	bio->bi_next     = NULL;
+}
+
 /* Short lived temporary struct on the stack.
  * We could squirrel the error to be returned into
  * bio->bi_iter.bi_size, or similar. But that would be too ugly. */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 64563bfdf0da..02044ab7f767 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1523,11 +1523,8 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
 	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
 		drbd_al_begin_io(device, &req->i);
 
-	req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO,
-					  &drbd_io_bio_set);
+	drbd_req_make_private_bio(req, req->master_bio);
 	bio_set_dev(req->private_bio, device->ldev->backing_bdev);
-	req->private_bio->bi_private = req;
-	req->private_bio->bi_end_io = drbd_request_endio;
 	submit_bio_noacct(req->private_bio);
 
 	return 0;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3fd99836bb1c..53ac59d19ae5 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 	rq->timeout = timeout;
 
 	/* insert request and run queue */
-	blk_execute_rq(NULL, rq, true);
+	blk_execute_rq(rq->q, NULL, rq, true);
 
 	if (int_cmd->status) {
 		dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n",
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index bfcab1c782b5..fce0a54df0e5 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -148,6 +148,10 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
 		sector += dev->zone_size_sects;
 	}
 
+	q->limits.zoned = BLK_ZONED_HM;
+	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
+	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
+
 	return 0;
 }
 
@@ -156,10 +160,6 @@ int null_register_zoned_dev(struct nullb *nullb)
 	struct nullb_device *dev = nullb->dev;
 	struct request_queue *q = nullb->q;
 
-	blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM);
-	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
-	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
-
 	if (queue_is_mq(q)) {
 		int ret = blk_revalidate_disk_zones(nullb->disk, NULL);
 
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 897acda20ac8..a7af4f27b7c3 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk,
 	req = blk_mq_rq_to_pdu(rq);
 
 	req->func = func;
-	blk_execute_rq(disk->gd, rq, 0);
+	blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
 	blk_put_request(rq);
 	return 0;
 }
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index fc4b0f1aa86d..658a0981cb54 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 	if (cgc->quiet)
 		rq->rq_flags |= RQF_QUIET;
 
-	blk_execute_rq(pd->bdev->bd_disk, rq, 0);
+	blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
 	if (scsi_req(rq)->result)
 		ret = -EIO;
 out:
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 2cdf2771f8e8..4478eb7efee0 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -539,7 +539,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	spin_unlock_irq(&host->lock);
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
-	blk_execute_rq_nowait(NULL, rq, true, NULL);
+	blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
 
 	return 0;
 
@@ -578,7 +578,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	crq->msg_bucket = (u32) rc;
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
-	blk_execute_rq_nowait(NULL, rq, true, NULL);
+	blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
 
 	return 0;
 }
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 37c409f3cd83..db3903e24d78 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -322,7 +322,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	if (err)
 		goto out;
 
-	blk_execute_rq(vblk->disk, req, false);
+	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
 	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
 out:
 	blk_put_request(req);
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 90ad34c6ef8e..8f0e52a71493 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2214,7 +2214,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		rq->timeout = 60 * HZ;
 		bio = rq->bio;
 
-		blk_execute_rq(cdi->disk, rq, 0);
+		blk_execute_rq(q, cdi->disk, rq, 0);
 		if (scsi_req(rq)->result) {
 			struct scsi_sense_hdr sshdr;
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index a1ce9f5ac3aa..013ad33fbbc8 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -107,7 +107,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	memcpy(scsi_req(rq)->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
 		scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
-	blk_execute_rq(disk, rq, 0);
+	blk_execute_rq(drive->queue, disk, rq, 0);
 	error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index cffbcc27a34c..25d2d88e82ad 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -467,7 +467,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 			}
 		}
 
-		blk_execute_rq(info->disk, rq, 0);
+		blk_execute_rq(drive->queue, info->disk, rq, 0);
 		error = scsi_req(rq)->result ? -EIO : 0;
 
 		if (buffer)
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 011eab9c69b7..46f2df288c6a 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -299,7 +299,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
-	blk_execute_rq(cd->disk, rq, 0);
+	blk_execute_rq(drive->queue, cd->disk, rq, 0);
 	ret = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 	/*
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index ca1d4b3d3878..f2f93ed40356 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -173,7 +173,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	*(int *)&scsi_req(rq)->cmd[1] = arg;
 	ide_req(rq)->special = setting->set;
 
-	blk_execute_rq(NULL, rq, 0);
+	blk_execute_rq(q, NULL, rq, 0);
 	ret = scsi_req(rq)->result;
 	blk_put_request(rq);
 
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 8413731c6259..34b9441084f8 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -482,7 +482,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
 
 	drive->mult_req = arg;
 	drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
-	blk_execute_rq(NULL, rq, 0);
+	blk_execute_rq(drive->queue, NULL, rq, 0);
 	blk_put_request(rq);
 
 	return (drive->mult_count == arg) ? 0 : -EIO;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 43fbc37d85c3..58994da10c06 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -137,7 +137,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, void __user *argp)
 
 		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 		ide_req(rq)->type = ATA_PRIV_TASKFILE;
-		blk_execute_rq(NULL, rq, 0);
+		blk_execute_rq(drive->queue, NULL, rq, 0);
 		err = scsi_req(rq)->result ? -EIO : 0;
 		blk_put_request(rq);
 
@@ -235,7 +235,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
-	blk_execute_rq(NULL, rq, 1);
+	blk_execute_rq(drive->queue, NULL, rq, 1);
 	ret = scsi_req(rq)->result;
 	blk_put_request(rq);
 	return ret;
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index a80a0f28f7b9..8af7af6001eb 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -37,7 +37,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	scsi_req(rq)->cmd_len = 1;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	ide_req(rq)->special = &timeout;
-	blk_execute_rq(NULL, rq, 1);
+	blk_execute_rq(q, NULL, rq, 1);
 	rc = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 	if (rc)
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index d680b3e3295f..82ab308f1aaf 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -27,7 +27,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 		mesg.event = PM_EVENT_FREEZE;
 	rqpm.pm_state = mesg.event;
 
-	blk_execute_rq(NULL, rq, 0);
+	blk_execute_rq(drive->queue, NULL, rq, 0);
 	ret = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 
@@ -50,7 +50,7 @@ static int ide_pm_execute_rq(struct request *rq)
 		blk_mq_end_request(rq, BLK_STS_OK);
 		return -ENXIO;
 	}
-	blk_execute_rq(NULL, rq, true);
+	blk_execute_rq(q, NULL, rq, true);
 
 	return scsi_req(rq)->result ? -EIO : 0;
 }
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index fa05e7e7d609..88b96437b22e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -868,7 +868,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 			goto out_put;
 	}
 
-	blk_execute_rq(tape->disk, rq, 0);
+	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
 	/* calculate the number of transferred bytes and update buffer state */
 	size -= scsi_req(rq)->resid_len;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 6665fc4724b9..d016cbe68cba 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -443,7 +443,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	ide_req(rq)->special = cmd;
 	cmd->rq = rq;
 
-	blk_execute_rq(NULL, rq, 0);
+	blk_execute_rq(drive->queue, NULL, rq, 0);
 	error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 63e809f38e3f..058dd8014428 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 	check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
 	if (!check)
 		return;
-	bio_set_dev(check, bio->bi_bdev);
+	check->bi_bdev = bio->bi_bdev;
 	check->bi_opf = REQ_OP_READ;
 	check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
 	check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 29c231758293..dfc35d6d05ed 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -475,7 +475,7 @@ struct search {
 	unsigned int		read_dirty_data:1;
 	unsigned int		cache_missed:1;
 
-	struct block_device	*orig_bdev;
+	struct block_device	*part;
 	unsigned long		start_time;
 
 	struct btree_op		op;
@@ -670,8 +670,8 @@ static void bio_complete(struct search *s)
 {
 	if (s->orig_bio) {
 		/* Count on bcache device */
-		bio_end_io_acct_remapped(s->orig_bio, s->start_time,
-					 s->orig_bdev);
+		part_end_io_acct(s->part, s->orig_bio, s->start_time);
+
 		trace_bcache_request_end(s->d, s->orig_bio);
 		s->orig_bio->bi_status = s->iop.status;
 		bio_endio(s->orig_bio);
@@ -714,8 +714,7 @@ static void search_free(struct closure *cl)
 }
 
 static inline struct search *search_alloc(struct bio *bio,
-		struct bcache_device *d, struct block_device *orig_bdev,
-		unsigned long start_time)
+					  struct bcache_device *d)
 {
 	struct search *s;
 
@@ -733,8 +732,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->write		= op_is_write(bio_op(bio));
 	s->read_dirty_data	= 0;
 	/* Count on the bcache device */
-	s->orig_bdev		= orig_bdev;
-	s->start_time		= start_time;
+	s->start_time		= part_start_io_acct(d->disk, &s->part, bio);
 	s->iop.c		= d->c;
 	s->iop.bio		= NULL;
 	s->iop.inode		= d->id;
@@ -1076,7 +1074,7 @@ struct detached_dev_io_private {
 	unsigned long		start_time;
 	bio_end_io_t		*bi_end_io;
 	void			*bi_private;
-	struct block_device	*orig_bdev;
+	struct block_device	*part;
 };
 
 static void detached_dev_end_io(struct bio *bio)
@@ -1088,7 +1086,7 @@ static void detached_dev_end_io(struct bio *bio)
 	bio->bi_private = ddip->bi_private;
 
 	/* Count on the bcache device */
-	bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev);
+	part_end_io_acct(ddip->part, bio, ddip->start_time);
 
 	if (bio->bi_status) {
 		struct cached_dev *dc = container_of(ddip->d,
@@ -1101,8 +1099,7 @@ static void detached_dev_end_io(struct bio *bio)
 	bio->bi_end_io(bio);
 }
 
-static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
-		struct block_device *orig_bdev, unsigned long start_time)
+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
 {
 	struct detached_dev_io_private *ddip;
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
@@ -1115,8 +1112,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
 	ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
 	ddip->d = d;
 	/* Count on the bcache device */
-	ddip->orig_bdev = orig_bdev;
-	ddip->start_time = start_time;
+	ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio);
 	ddip->bi_end_io = bio->bi_end_io;
 	ddip->bi_private = bio->bi_private;
 	bio->bi_end_io = detached_dev_end_io;
@@ -1172,10 +1168,8 @@ static void quit_max_writeback_rate(struct cache_set *c,
 blk_qc_t cached_dev_submit_bio(struct bio *bio)
 {
 	struct search *s;
-	struct block_device *orig_bdev = bio->bi_bdev;
-	struct bcache_device *d = orig_bdev->bd_disk->private_data;
+	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
-	unsigned long start_time;
 	int rw = bio_data_dir(bio);
 
 	if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
@@ -1200,13 +1194,11 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
 		}
 	}
 
-	start_time = bio_start_io_acct(bio);
-
 	bio_set_dev(bio, dc->bdev);
 	bio->bi_iter.bi_sector += dc->sb.data_offset;
 
 	if (cached_dev_get(dc)) {
-		s = search_alloc(bio, d, orig_bdev, start_time);
+		s = search_alloc(bio, d);
 		trace_bcache_request_start(s->d, bio);
 
 		if (!bio->bi_iter.bi_size) {
@@ -1227,7 +1219,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
 		}
 	} else
 		/* I/O request sent to backing device */
-		detached_dev_do_request(d, bio, orig_bdev, start_time);
+		detached_dev_do_request(d, bio);
 
 	return BLK_QC_T_NONE;
 }
@@ -1291,7 +1283,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio));
+	s = search_alloc(bio, d);
 	cl = &s->cl;
 	bio = &s->bio.bio;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 193fe7652329..2047a9cccdb5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1939,7 +1939,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 		goto err;
 
 	if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
-			BIOSET_NEED_RESCUER))
+			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
 		goto err;
 
 	c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index a90bdf9b2ca6..bdb255edc200 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -85,6 +85,12 @@ struct clone {
 
 	struct dm_clone_metadata *cmd;
 
+	/*
+	 * bio used to flush the destination device, before committing the
+	 * metadata.
+	 */
+	struct bio flush_bio;
+
 	/* Region hydration hash table */
 	struct hash_table_bucket *ht;
 
@@ -1149,7 +1155,11 @@ static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
 		goto out;
 	}
 
-	r = blkdev_issue_flush(clone->dest_dev->bdev);
+	bio_reset(&clone->flush_bio);
+	bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
+	clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+	r = submit_bio_wait(&clone->flush_bio);
 	if (unlikely(r)) {
 		__metadata_operation_failed(clone, "flush destination device", r);
 		goto out;
@@ -1876,6 +1886,7 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	bio_list_init(&clone->deferred_flush_completions);
 	clone->hydration_offset = 0;
 	atomic_set(&clone->hydrations_in_flight, 0);
+	bio_init(&clone->flush_bio, NULL, 0);
 
 	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
 	if (!clone->wq) {
@@ -1947,6 +1958,7 @@ static void clone_dtr(struct dm_target *ti)
 	struct clone *clone = ti->private;
 
 	mutex_destroy(&clone->commit_lock);
+	bio_uninit(&clone->flush_bio);
 
 	for (i = 0; i < clone->nr_ctr_args; i++)
 		kfree(clone->ctr_args[i]);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 039d17b28938..b298fefb022e 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -819,7 +819,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
 	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
 			     mblk->page);
 	if (ret == 0)
-		ret = blkdev_issue_flush(dev->bdev);
+		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
 
 	return ret;
 }
@@ -862,7 +862,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
 
 	/* Flush drive cache (this will also sync data) */
 	if (ret == 0)
-		ret = blkdev_issue_flush(dev->bdev);
+		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
 
 	return ret;
 }
@@ -933,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
 
 	/* If there are no dirty metadata blocks, just flush the device cache */
 	if (list_empty(&write_list)) {
-		ret = blkdev_issue_flush(dev->bdev);
+		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
 		goto err;
 	}
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 21da0c48f6c2..cf06dbb1aa53 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -340,6 +340,24 @@ static int start_readonly;
  */
 static bool create_on_open = true;
 
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+			    struct mddev *mddev)
+{
+	if (!mddev || !bioset_initialized(&mddev->bio_set))
+		return bio_alloc(gfp_mask, nr_iovecs);
+
+	return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+static struct bio *md_bio_alloc_sync(struct mddev *mddev)
+{
+	if (!mddev || !bioset_initialized(&mddev->sync_set))
+		return bio_alloc(GFP_NOIO, 1);
+
+	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
+}
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -445,8 +463,8 @@ struct md_io {
 	struct mddev *mddev;
 	bio_end_io_t *orig_bi_end_io;
 	void *orig_bi_private;
-	struct block_device *orig_bi_bdev;
 	unsigned long start_time;
+	struct block_device *part;
 };
 
 static void md_end_io(struct bio *bio)
@@ -454,7 +472,7 @@ static void md_end_io(struct bio *bio)
 	struct md_io *md_io = bio->bi_private;
 	struct mddev *mddev = md_io->mddev;
 
-	bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev);
+	part_end_io_acct(md_io->part, bio, md_io->start_time);
 
 	bio->bi_end_io = md_io->orig_bi_end_io;
 	bio->bi_private = md_io->orig_bi_private;
@@ -496,12 +514,12 @@ static blk_qc_t md_submit_bio(struct bio *bio)
 		md_io->mddev = mddev;
 		md_io->orig_bi_end_io = bio->bi_end_io;
 		md_io->orig_bi_private = bio->bi_private;
-		md_io->orig_bi_bdev = bio->bi_bdev;
 
 		bio->bi_end_io = md_end_io;
 		bio->bi_private = md_io;
 
-		md_io->start_time = bio_start_io_acct(bio);
+		md_io->start_time = part_start_io_acct(mddev->gendisk,
+						       &md_io->part, bio);
 	}
 
 	/* bio could be mergeable after passing to underlayer */
@@ -595,7 +613,7 @@ static void submit_flushes(struct work_struct *ws)
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
-			bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set);
+			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
 			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bio_set_dev(bi, rdev->bdev);
@@ -981,7 +999,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	if (test_bit(Faulty, &rdev->flags))
 		return;
 
-	bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
+	bio = md_bio_alloc_sync(mddev);
 
 	atomic_inc(&rdev->nr_pending);
 
@@ -1013,29 +1031,29 @@ int md_super_wait(struct mddev *mddev)
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 		 struct page *page, int op, int op_flags, bool metadata_op)
 {
-	struct bio bio;
-	struct bio_vec bvec;
-
-	bio_init(&bio, &bvec, 1);
+	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
+	int ret;
 
 	if (metadata_op && rdev->meta_bdev)
-		bio_set_dev(&bio, rdev->meta_bdev);
+		bio_set_dev(bio, rdev->meta_bdev);
 	else
-		bio_set_dev(&bio, rdev->bdev);
-	bio.bi_opf = op | op_flags;
+		bio_set_dev(bio, rdev->bdev);
+	bio_set_op_attrs(bio, op, op_flags);
 	if (metadata_op)
-		bio.bi_iter.bi_sector = sector + rdev->sb_start;
+		bio->bi_iter.bi_sector = sector + rdev->sb_start;
 	else if (rdev->mddev->reshape_position != MaxSector &&
 		 (rdev->mddev->reshape_backwards ==
 		  (sector >= rdev->mddev->reshape_position)))
-		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
+		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
 	else
-		bio.bi_iter.bi_sector = sector + rdev->data_offset;
-	bio_add_page(&bio, page, size, 0);
+		bio->bi_iter.bi_sector = sector + rdev->data_offset;
+	bio_add_page(bio, page, size, 0);
 
-	submit_bio_wait(&bio);
+	submit_bio_wait(bio);
 
-	return !bio.bi_status;
+	ret = !bio->bi_status;
+	bio_put(bio);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sync_page_io);
 
@@ -2399,12 +2417,6 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
 }
 EXPORT_SYMBOL(md_integrity_add_rdev);
 
-static bool rdev_read_only(struct md_rdev *rdev)
-{
-	return bdev_read_only(rdev->bdev) ||
-		(rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
-}
-
 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 {
 	char b[BDEVNAME_SIZE];
@@ -2414,7 +2426,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	if (find_rdev(mddev, rdev->bdev->bd_dev))
 		return -EEXIST;
 
-	if (rdev_read_only(rdev) && mddev->pers)
+	if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
+	    mddev->pers)
 		return -EROFS;
 
 	/* make sure rdev->sectors exceeds mddev->dev_sectors */
@@ -5848,7 +5861,9 @@ int md_run(struct mddev *mddev)
 			continue;
 		sync_blockdev(rdev->bdev);
 		invalidate_bdev(rdev->bdev);
-		if (mddev->ro != 1 && rdev_read_only(rdev)) {
+		if (mddev->ro != 1 &&
+		    (bdev_read_only(rdev->bdev) ||
+		     bdev_read_only(rdev->meta_bdev))) {
 			mddev->ro = 1;
 			if (mddev->gendisk)
 				set_disk_ro(mddev->gendisk, 1);
@@ -6143,7 +6158,7 @@ static int restart_array(struct mddev *mddev)
 		if (test_bit(Journal, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags))
 			has_journal = true;
-		if (rdev_read_only(rdev))
+		if (bdev_read_only(rdev->bdev))
 			has_readonly = true;
 	}
 	rcu_read_unlock();
diff --git a/drivers/md/md.h b/drivers/md/md.h
index bcbba1b5ec4a..f13290ccc1c2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -742,6 +742,8 @@ extern void md_rdev_clear(struct md_rdev *rdev);
 extern void md_handle_request(struct mddev *mddev, struct bio *bio);
 extern void mddev_suspend(struct mddev *mddev);
 extern void mddev_resume(struct mddev *mddev);
+extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+				   struct mddev *mddev);
 
 extern void md_reload_sb(struct mddev *mddev, int raid_disk);
 extern void md_update_sb(struct mddev *mddev, int force);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d2378765dc15..3b19141cdb4b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1104,7 +1104,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
 	int i = 0;
 	struct bio *behind_bio = NULL;
 
-	behind_bio = bio_alloc_bioset(GFP_NOIO, vcnt, &r1_bio->mddev->bio_set);
+	behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
 	if (!behind_bio)
 		return;
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a9ae7d113492..be8f14afb6d1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4531,7 +4531,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		return sectors_done;
 	}
 
-	read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set);
+	read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
 
 	bio_set_dev(read_bio, rdev->bdev);
 	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
@@ -4539,6 +4539,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	read_bio->bi_private = r10_bio;
 	read_bio->bi_end_io = end_reshape_read;
 	bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
+	read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
+	read_bio->bi_status = 0;
+	read_bio->bi_vcnt = 0;
+	read_bio->bi_iter.bi_size = 0;
 	r10_bio->master_bio = read_bio;
 	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
 
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index e8c118e05dfd..d0f540296fe9 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
 	}
 
 	/* flush the disk cache after recovery if necessary */
-	ret = blkdev_issue_flush(rdev->bdev);
+	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL);
 out:
 	__free_page(page);
 	return ret;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a348b2adf2a9..f411b9e5c332 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5393,72 +5393,90 @@ static void raid5_align_endio(struct bio *bi)
 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 {
 	struct r5conf *conf = mddev->private;
-	struct bio *align_bio;
+	int dd_idx;
+	struct bio* align_bi;
 	struct md_rdev *rdev;
-	sector_t sector, end_sector, first_bad;
-	int bad_sectors, dd_idx;
+	sector_t end_sector;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
 		pr_debug("%s: non aligned\n", __func__);
 		return 0;
 	}
+	/*
+	 * use bio_clone_fast to make a copy of the bio
+	 */
+	align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
+	if (!align_bi)
+		return 0;
+	/*
+	 *   set bi_end_io to a new function, and set bi_private to the
+	 *     original bio.
+	 */
+	align_bi->bi_end_io  = raid5_align_endio;
+	align_bi->bi_private = raid_bio;
+	/*
+	 *	compute position
+	 */
+	align_bi->bi_iter.bi_sector =
+		raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
+				     0, &dd_idx, NULL);
 
-	sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
-				      &dd_idx, NULL);
-	end_sector = bio_end_sector(raid_bio);
-
+	end_sector = bio_end_sector(align_bi);
 	rcu_read_lock();
-	if (r5c_big_stripe_cached(conf, sector))
-		goto out_rcu_unlock;
-
 	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
 	if (!rdev || test_bit(Faulty, &rdev->flags) ||
 	    rdev->recovery_offset < end_sector) {
 		rdev = rcu_dereference(conf->disks[dd_idx].rdev);
-		if (!rdev)
-			goto out_rcu_unlock;
-		if (test_bit(Faulty, &rdev->flags) ||
+		if (rdev &&
+		    (test_bit(Faulty, &rdev->flags) ||
 		    !(test_bit(In_sync, &rdev->flags) ||
-		      rdev->recovery_offset >= end_sector))
-			goto out_rcu_unlock;
+		      rdev->recovery_offset >= end_sector)))
+			rdev = NULL;
 	}
 
-	atomic_inc(&rdev->nr_pending);
-	rcu_read_unlock();
-
-	align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
-	bio_set_dev(align_bio, rdev->bdev);
-	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = raid_bio;
-	align_bio->bi_iter.bi_sector = sector;
-
-	raid_bio->bi_next = (void *)rdev;
-
-	if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad,
-			&bad_sectors)) {
-		bio_put(align_bio);
-		rdev_dec_pending(rdev, mddev);
+	if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
+		rcu_read_unlock();
+		bio_put(align_bi);
 		return 0;
 	}
 
-	/* No reshape active, so we can trust rdev->data_offset */
-	align_bio->bi_iter.bi_sector += rdev->data_offset;
+	if (rdev) {
+		sector_t first_bad;
+		int bad_sectors;
 
-	spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-			    conf->device_lock);
-	atomic_inc(&conf->active_aligned_reads);
-	spin_unlock_irq(&conf->device_lock);
+		atomic_inc(&rdev->nr_pending);
+		rcu_read_unlock();
+		raid_bio->bi_next = (void*)rdev;
+		bio_set_dev(align_bi, rdev->bdev);
 
-	if (mddev->gendisk)
-		trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
-				      raid_bio->bi_iter.bi_sector);
-	submit_bio_noacct(align_bio);
-	return 1;
+		if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
+				bio_sectors(align_bi),
+				&first_bad, &bad_sectors)) {
+			bio_put(align_bi);
+			rdev_dec_pending(rdev, mddev);
+			return 0;
+		}
 
-out_rcu_unlock:
-	rcu_read_unlock();
-	return 0;
+		/* No reshape active, so we can trust rdev->data_offset */
+		align_bi->bi_iter.bi_sector += rdev->data_offset;
+
+		spin_lock_irq(&conf->device_lock);
+		wait_event_lock_irq(conf->wait_for_quiescent,
+				    conf->quiesce == 0,
+				    conf->device_lock);
+		atomic_inc(&conf->active_aligned_reads);
+		spin_unlock_irq(&conf->device_lock);
+
+		if (mddev->gendisk)
+			trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk),
+					      raid_bio->bi_iter.bi_sector);
+		submit_bio_noacct(align_bi);
+		return 1;
+	} else {
+		rcu_read_unlock();
+		bio_put(align_bi);
+		return 0;
+	}
 }
 
 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index a1d6b68320ae..42e27a298218 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -253,7 +253,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
 		goto out_put;
 	}
 	req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP;
-	blk_execute_rq(NULL, req, 0);
+	blk_execute_rq(mq->queue, NULL, req, 0);
 	ret = req_to_mmc_queue_req(req)->drv_op_result;
 	blk_put_request(req);
 
@@ -629,7 +629,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
 		rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
 	req_to_mmc_queue_req(req)->drv_op_data = idatas;
 	req_to_mmc_queue_req(req)->ioc_count = 1;
-	blk_execute_rq(NULL, req, 0);
+	blk_execute_rq(mq->queue, NULL, req, 0);
 	ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
 	err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata);
 	blk_put_request(req);
@@ -698,7 +698,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
 		rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
 	req_to_mmc_queue_req(req)->drv_op_data = idata;
 	req_to_mmc_queue_req(req)->ioc_count = num_of_cmds;
-	blk_execute_rq(NULL, req, 0);
+	blk_execute_rq(mq->queue, NULL, req, 0);
 	ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
 
 	/* copy to user if data and response */
@@ -2722,7 +2722,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
-	blk_execute_rq(NULL, req, 0);
+	blk_execute_rq(mq->queue, NULL, req, 0);
 	ret = req_to_mmc_queue_req(req)->drv_op_result;
 	if (ret >= 0) {
 		*val = ret;
@@ -2761,7 +2761,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
 	}
 	req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD;
 	req_to_mmc_queue_req(req)->drv_op_data = &ext_csd;
-	blk_execute_rq(NULL, req, 0);
+	blk_execute_rq(mq->queue, NULL, req, 0);
 	err = req_to_mmc_queue_req(req)->drv_op_result;
 	blk_put_request(req);
 	if (err) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 89cacff897a8..ad6a4ae5ed6f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -925,7 +925,7 @@ static void nvme_execute_rq_polled(struct request_queue *q,
 
 	rq->cmd_flags |= REQ_HIPRI;
 	rq->end_io_data = &wait;
-	blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq);
+	blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
 
 	while (!completion_done(&wait)) {
 		blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
@@ -964,7 +964,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 	if (poll)
 		nvme_execute_rq_polled(req->q, NULL, req, at_head);
 	else
-		blk_execute_rq(NULL, req, at_head);
+		blk_execute_rq(req->q, NULL, req, at_head);
 	if (result)
 		*result = nvme_req(req)->result;
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -1101,7 +1101,7 @@ void nvme_execute_passthru_rq(struct request *rq)
 	u32 effects;
 
 	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
-	blk_execute_rq(disk, rq, 0);
+	blk_execute_rq(rq->q, disk, rq, 0);
 	nvme_passthru_end(ctrl, effects);
 }
 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
@@ -1133,8 +1133,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		if (ret)
 			goto out;
 		bio = req->bio;
-		if (bdev)
-			bio_set_dev(bio, bdev);
+		bio->bi_bdev = bdev;
 		if (bdev && meta_buffer && meta_len) {
 			meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
 					meta_seed, write);
@@ -1203,7 +1202,7 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl)
 	rq->timeout = ctrl->kato * HZ;
 	rq->end_io_data = ctrl;
 
-	blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io);
+	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
 
 	return 0;
 }
@@ -2176,18 +2175,17 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 	ns->lba_shift = id->lbaf[lbaf].ds;
 	nvme_set_queue_limits(ns->ctrl, ns->queue);
 
-	ret = nvme_configure_metadata(ns, id);
-	if (ret)
-		goto out_unfreeze;
-	nvme_set_chunk_sectors(ns, id);
-	nvme_update_disk_info(ns->disk, ns, id);
-
 	if (ns->head->ids.csi == NVME_CSI_ZNS) {
 		ret = nvme_update_zone_info(ns, lbaf);
 		if (ret)
 			goto out_unfreeze;
 	}
 
+	ret = nvme_configure_metadata(ns, id);
+	if (ret)
+		goto out_unfreeze;
+	nvme_set_chunk_sectors(ns, id);
+	nvme_update_disk_info(ns->disk, ns, id);
 	blk_mq_unfreeze_queue(ns->disk->queue);
 
 	if (blk_queue_is_zoned(ns->queue)) {
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index b705988629f2..6c8eab8de288 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -695,7 +695,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
 
 	rq->end_io_data = rqd;
 
-	blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io);
+	blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
 
 	return 0;
 
@@ -816,10 +816,10 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
 			vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma);
 		}
 
-		bio_set_dev(bio, ns->disk->part0);
+		bio->bi_bdev = ns->disk->part0;
 	}
 
-	blk_execute_rq(NULL, rq, 0);
+	blk_execute_rq(q, NULL, rq, 0);
 
 	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
 		ret = -EINTR;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1427c9555cef..7f63a51a53df 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
 	if (likely(ns)) {
-		bio_set_dev(bio, ns->disk->part0);
+		bio->bi_bdev = ns->disk->part0;
 		bio->bi_opf |= REQ_NVME_MPATH;
 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
 				      bio->bi_iter.bi_sector);
@@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work)
 		 * Reset disk to the mpath node and resubmit to select a new
 		 * path.
 		 */
-		bio_set_dev(bio, head->disk->part0);
+		bio->bi_bdev = head->disk->part0;
 		submit_bio_noacct(bio);
 	}
 }
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 808ab5186928..6bad4d4dcdf0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1357,7 +1357,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	}
 
 	abort_req->end_io_data = NULL;
-	blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio);
+	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
 
 	/*
 	 * The aborted req will be completed on receiving the abort req.
@@ -2281,7 +2281,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 	req->end_io_data = nvmeq;
 
 	init_completion(&nvmeq->delete_done);
-	blk_execute_rq_nowait(NULL, req, false,
+	blk_execute_rq_nowait(q, NULL, req, false,
 			opcode == nvme_admin_delete_cq ?
 				nvme_del_cq_end : nvme_del_queue_end);
 	return 0;
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index c7e3ec561ba0..1dfe9a3500e3 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -9,7 +9,13 @@
 
 int nvme_revalidate_zones(struct nvme_ns *ns)
 {
-	return blk_revalidate_disk_zones(ns->disk, NULL);
+	struct request_queue *q = ns->queue;
+	int ret;
+
+	ret = blk_revalidate_disk_zones(ns->disk, NULL);
+	if (!ret)
+		blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
+	return ret;
 }
 
 static int nvme_set_max_append(struct nvme_ctrl *ctrl)
@@ -103,11 +109,10 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 		goto free_data;
 	}
 
-	blk_queue_set_zoned(ns->disk, BLK_ZONED_HM);
+	q->limits.zoned = BLK_ZONED_HM;
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
 	blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
-	blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
 free_data:
 	kfree(id);
 	return status;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index bf6e0ac9ad28..125dde3f410e 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -333,7 +333,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
 
 u16 nvmet_bdev_flush(struct nvmet_req *req)
 {
-	if (blkdev_issue_flush(req->ns->bdev))
+	if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL))
 		return NVME_SC_INTERNAL | NVME_SC_DNR;
 	return 0;
 }
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index cbc88acdd233..b9776fc8f08f 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -275,7 +275,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
 		schedule_work(&req->p.work);
 	} else {
 		rq->end_io_data = req;
-		blk_execute_rq_nowait(ns ? ns->disk : NULL, rq, 0,
+		blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0,
 				      nvmet_passthru_req_done);
 	}
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 28c04a4efa66..c7eb9a10c680 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -428,15 +428,23 @@ static int dasd_state_unfmt_to_basic(struct dasd_device *device)
 static int
 dasd_state_ready_to_online(struct dasd_device * device)
 {
+	struct gendisk *disk;
+	struct disk_part_iter piter;
+	struct block_device *part;
+
 	device->state = DASD_STATE_ONLINE;
 	if (device->block) {
 		dasd_schedule_block_bh(device->block);
 		if ((device->features & DASD_FEATURE_USERAW)) {
-			kobject_uevent(&disk_to_dev(device->block->gdp)->kobj,
-					KOBJ_CHANGE);
+			disk = device->block->gdp;
+			kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 			return 0;
 		}
-		disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
+		disk = device->block->bdev->bd_disk;
+		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+		while ((part = disk_part_iter_next(&piter)))
+			kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
+		disk_part_iter_exit(&piter);
 	}
 	return 0;
 }
@@ -447,6 +455,9 @@ dasd_state_ready_to_online(struct dasd_device * device)
 static int dasd_state_online_to_ready(struct dasd_device *device)
 {
 	int rc;
+	struct gendisk *disk;
+	struct disk_part_iter piter;
+	struct block_device *part;
 
 	if (device->discipline->online_to_ready) {
 		rc = device->discipline->online_to_ready(device);
@@ -455,8 +466,13 @@ static int dasd_state_online_to_ready(struct dasd_device *device)
 	}
 
 	device->state = DASD_STATE_READY;
-	if (device->block && !(device->features & DASD_FEATURE_USERAW))
-		disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
+	if (device->block && !(device->features & DASD_FEATURE_USERAW)) {
+		disk = device->block->bdev->bd_disk;
+		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+		while ((part = disk_part_iter_next(&piter)))
+			kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
+		disk_part_iter_exit(&piter);
+	}
 	return 0;
 }
 
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c00f06e9ecb0..f11f51e2465f 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2007,7 +2007,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 	req->timeout = 10 * HZ;
 	rq->retries = 5;
 
-	blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done);
+	blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
 }
 
 /**
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 4d2280658559..b3f14f05340a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -269,7 +269,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	/*
 	 * head injection *required* here otherwise quiesce won't work
 	 */
-	blk_execute_rq(NULL, req, 1);
+	blk_execute_rq(req->q, NULL, req, 1);
 
 	/*
 	 * Some devices (USB mass-storage in particular) may transfer
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 03adb39293c2..cf07b7f93579 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -665,28 +665,12 @@ static int sd_zbc_init_disk(struct scsi_disk *sdkp)
 	return 0;
 }
 
-static void sd_zbc_clear_zone_info(struct scsi_disk *sdkp)
+void sd_zbc_release_disk(struct scsi_disk *sdkp)
 {
-	/* Serialize against revalidate zones */
-	mutex_lock(&sdkp->rev_mutex);
-
 	kvfree(sdkp->zones_wp_offset);
 	sdkp->zones_wp_offset = NULL;
 	kfree(sdkp->zone_wp_update_buf);
 	sdkp->zone_wp_update_buf = NULL;
-
-	sdkp->nr_zones = 0;
-	sdkp->rev_nr_zones = 0;
-	sdkp->zone_blocks = 0;
-	sdkp->rev_zone_blocks = 0;
-
-	mutex_unlock(&sdkp->rev_mutex);
-}
-
-void sd_zbc_release_disk(struct scsi_disk *sdkp)
-{
-	if (sd_is_zoned(sdkp))
-		sd_zbc_clear_zone_info(sdkp);
 }
 
 static void sd_zbc_revalidate_zones_cb(struct gendisk *disk)
@@ -785,21 +769,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
 		 */
 		return 0;
 
-	/* READ16/WRITE16 is mandatory for ZBC disks */
-	sdkp->device->use_16_for_rw = 1;
-	sdkp->device->use_10_for_rw = 0;
-
-	if (!blk_queue_is_zoned(q)) {
-		/*
-		 * This can happen for a host aware disk with partitions.
-		 * The block device zone information was already cleared
-		 * by blk_queue_set_zoned(). Only clear the scsi disk zone
-		 * information and exit early.
-		 */
-		sd_zbc_clear_zone_info(sdkp);
-		return 0;
-	}
-
 	/* Check zoned block device characteristics (unconstrained reads) */
 	ret = sd_zbc_check_zoned_characteristics(sdkp, buf);
 	if (ret)
@@ -820,13 +789,9 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
 	blk_queue_max_active_zones(q, 0);
 	nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
 
-	/*
-	 * Per ZBC and ZAC specifications, writes in sequential write required
-	 * zones of host-managed devices must be aligned to the device physical
-	 * block size.
-	 */
-	if (blk_queue_zoned_model(q) == BLK_ZONED_HM)
-		blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
+	/* READ16/WRITE16 is mandatory for ZBC disks */
+	sdkp->device->use_16_for_rw = 1;
+	sdkp->device->use_10_for_rw = 0;
 
 	sdkp->rev_nr_zones = nr_zones;
 	sdkp->rev_zone_blocks = zone_blocks;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 4383d93110f8..bfa8d77322d7 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -829,7 +829,8 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
 
 	srp->rq->timeout = timeout;
 	kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */
-	blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io);
+	blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk,
+			      srp->rq, at_head, sg_rq_end_io);
 	return 0;
 }
 
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 841ad2fc369a..43f7624508a9 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -585,7 +585,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 	rq->retries = retries;
 	req->end_io_data = SRpnt;
 
-	blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end);
+	blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
 	return 0;
 }
 
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index cce455929778..b0cb5b95e892 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -241,7 +241,6 @@ struct target_core_file_cmd {
 	unsigned long	len;
 	struct se_cmd	*cmd;
 	struct kiocb	iocb;
-	struct bio_vec	bvecs[];
 };
 
 static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
@@ -269,22 +268,29 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	struct target_core_file_cmd *aio_cmd;
 	struct iov_iter iter = {};
 	struct scatterlist *sg;
+	struct bio_vec *bvec;
 	ssize_t len = 0;
 	int ret = 0, i;
 
-	aio_cmd = kmalloc(struct_size(aio_cmd, bvecs, sgl_nents), GFP_KERNEL);
+	aio_cmd = kmalloc(sizeof(struct target_core_file_cmd), GFP_KERNEL);
 	if (!aio_cmd)
 		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 
+	bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL);
+	if (!bvec) {
+		kfree(aio_cmd);
+		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+	}
+
 	for_each_sg(sgl, sg, sgl_nents, i) {
-		aio_cmd->bvecs[i].bv_page = sg_page(sg);
-		aio_cmd->bvecs[i].bv_len = sg->length;
-		aio_cmd->bvecs[i].bv_offset = sg->offset;
+		bvec[i].bv_page = sg_page(sg);
+		bvec[i].bv_len = sg->length;
+		bvec[i].bv_offset = sg->offset;
 
 		len += sg->length;
 	}
 
-	iov_iter_bvec(&iter, is_write, aio_cmd->bvecs, sgl_nents, len);
+	iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len);
 
 	aio_cmd->cmd = cmd;
 	aio_cmd->len = len;
@@ -301,6 +307,8 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	else
 		ret = call_read_iter(file, &aio_cmd->iocb, &iter);
 
+	kfree(bvec);
+
 	if (ret != -EIOCBQUEUED)
 		cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0);
 
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 33770e5808ce..7994f27e4527 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1000,7 +1000,8 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 		req->timeout = PS_TIMEOUT_OTHER;
 	scsi_req(req)->retries = PS_RETRY;
 
-	blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG),
+	blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
+			(cmd->sam_task_attr == TCM_HEAD_TAG),
 			pscsi_req_done);
 
 	return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ec26179c8062..235b5042672e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -126,6 +126,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
 		bd_abort_claiming(bdev, truncate_bdev_range);
 	return 0;
 }
+EXPORT_SYMBOL(truncate_bdev_range);
 
 static void set_init_blocksize(struct block_device *bdev)
 {
@@ -423,7 +424,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
-		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES);
+		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
 		if (!nr_pages) {
 			bool polled = false;
 
@@ -488,10 +489,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	int nr_pages;
 
-	if (!iov_iter_count(iter))
+	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
+	if (!nr_pages)
 		return 0;
-
-	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1);
 	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 
@@ -688,7 +688,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	 * i_mutex and doing so causes performance issues with concurrent
 	 * O_SYNC writers to a block device.
 	 */
-	error = blkdev_issue_flush(bdev);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
@@ -1808,11 +1808,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 		return error;
 
 	/*
-	 * Invalidate the page cache again; if someone wandered in and dirtied
-	 * a page, we just discard it - userspace has no way of knowing whether
-	 * the write happened before or after discard completing...
+	 * Invalidate again; if someone wandered in and dirtied a page,
+	 * the caller will be given -EBUSY.  The third argument is
+	 * inclusive, so the rounding here is safe.
 	 */
-	return truncate_bdev_range(bdev, file->f_mode, start, end);
+	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+					     start >> PAGE_SHIFT,
+					     end >> PAGE_SHIFT);
 }
 
 const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bc3b33efddc5..b8fab44394f5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -421,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
 	 * Preallocate a bio that's always going to be used for flushing device
 	 * barriers and matches the device lifespan
 	 */
-	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
+	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
 	if (!dev->flush_bio) {
 		kfree(dev);
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 00c419e49f0e..4134e2e19090 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -431,8 +431,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	unsigned long flags;
 
 	bio->bi_private = dio;
-	/* don't account direct I/O as memory stall */
-	bio_clear_flag(bio, BIO_WORKINGSET);
 
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	dio->refcount++;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 183ffdf4d43c..a92478eabfa4 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -361,7 +361,7 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	if (err)
 		return err;
 
-	return blkdev_issue_flush(inode->i_sb->s_bdev);
+	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 }
 
 const struct file_operations exfat_file_operations = {
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 6e8208acfc62..0a14a7c87bf8 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1076,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	 * flush before we start writing fast commit blocks.
 	 */
 	if (journal->j_fs_dev != journal->j_dev)
-		blkdev_issue_flush(journal->j_fs_dev);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
 
 	blk_start_plug(&plug);
 	if (sbi->s_fc_bytes == 0) {
@@ -1535,7 +1535,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
 out:
 	iput(inode);
 	if (!ret)
-		blkdev_issue_flush(sb->s_bdev);
+		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
 
 	return 0;
 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 027a7d7037a0..113bfb023a4a 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -174,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
 
 	if (needs_barrier) {
-		err = blkdev_issue_flush(inode->i_sb->s_bdev);
+		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 		if (!ret)
 			ret = err;
 	}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index faa912862591..14aef148a968 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1586,7 +1586,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 	if (ret < 0)
 		goto err_out;
 	if (barrier)
-		blkdev_issue_flush(sb->s_bdev);
+		blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
 
 skip_zeroout:
 	ext4_lock_group(sb, group);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 06fe22fe7b51..81c47b75ff99 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5703,7 +5703,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 		needs_barrier = true;
 	if (needs_barrier) {
 		int err;
-		err = blkdev_issue_flush(sb->s_bdev);
+		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
 		if (!ret)
 			ret = err;
 	}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index eba1546b5ea1..ffcddabfd11d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -50,6 +50,27 @@ void f2fs_destroy_bioset(void)
 	bioset_exit(&f2fs_bioset);
 }
 
+static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
+						unsigned int nr_iovecs)
+{
+	return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
+}
+
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
+{
+	if (noio) {
+		/* No failure on bio allocation */
+		return __f2fs_bio_alloc(GFP_NOIO, npages);
+	}
+
+	if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
+		f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
+		return NULL;
+	}
+
+	return __f2fs_bio_alloc(GFP_KERNEL, npages);
+}
+
 static bool __is_cp_guaranteed(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
@@ -383,7 +404,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
 	struct f2fs_sb_info *sbi = fio->sbi;
 	struct bio *bio;
 
-	bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
+	bio = f2fs_bio_alloc(sbi, npages, true);
 
 	f2fs_target_device(sbi, fio->new_blkaddr, bio);
 	if (is_read_io(fio->op)) {
@@ -973,9 +994,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 	struct bio_post_read_ctx *ctx;
 	unsigned int post_read_steps = 0;
 
-	bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
-			       min_t(int, nr_pages, BIO_MAX_PAGES),
-			       &f2fs_bioset);
+	bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
+								for_write);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 52b7c7d0a351..f0aec024e5df 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -43,6 +43,7 @@ enum {
 	FAULT_KVMALLOC,
 	FAULT_PAGE_ALLOC,
 	FAULT_PAGE_GET,
+	FAULT_ALLOC_BIO,
 	FAULT_ALLOC_NID,
 	FAULT_ORPHAN,
 	FAULT_BLOCK,
@@ -3481,6 +3482,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
  */
 int __init f2fs_init_bioset(void);
 void f2fs_destroy_bioset(void);
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
 int f2fs_init_bio_entry_cache(void);
 void f2fs_destroy_bio_entry_cache(void);
 void f2fs_submit_bio(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 993004f06a77..440634dfaa56 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -563,7 +563,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
 static int __submit_flush_wait(struct f2fs_sb_info *sbi,
 				struct block_device *bdev)
 {
-	int ret = blkdev_issue_flush(bdev);
+	struct bio *bio;
+	int ret;
+
+	bio = f2fs_bio_alloc(sbi, 0, false);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
+	bio_set_dev(bio, bdev);
+	ret = submit_bio_wait(bio);
+	bio_put(bio);
 
 	trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
 				test_opt(sbi, FLUSH_MERGE), ret);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 4acfa7d36731..30d5abef4361 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -46,6 +46,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
 	[FAULT_KVMALLOC]	= "kvmalloc",
 	[FAULT_PAGE_ALLOC]	= "page alloc",
 	[FAULT_PAGE_GET]	= "page get",
+	[FAULT_ALLOC_BIO]	= "alloc bio",
 	[FAULT_ALLOC_NID]	= "alloc nid",
 	[FAULT_ORPHAN]		= "orphan",
 	[FAULT_BLOCK]		= "no more block",
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 5fee74f1ad61..f9ee27cf4d7c 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	if (err)
 		return err;
 
-	return blkdev_issue_flush(inode->i_sb->s_bdev);
+	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 }
 
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index ca464328b79c..e3da9e96b835 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 	}
 
 	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
-		blkdev_issue_flush(inode->i_sb->s_bdev);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 
 	inode_unlock(inode);
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index b9e3db3f855f..807119ae5adf 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -239,7 +239,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 	mutex_unlock(&sbi->vh_mutex);
 
 	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
-		blkdev_issue_flush(sb->s_bdev);
+		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
 
 	return error;
 }
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4e339bba6afb..25675c1fa44d 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -283,8 +283,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	orig_count = iov_iter_count(dio->submit.iter);
 	iov_iter_truncate(dio->submit.iter, length);
 
-	if (!iov_iter_count(dio->submit.iter))
+	nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
+	if (nr_pages <= 0) {
+		ret = nr_pages;
 		goto out;
+	}
 
 	if (need_zeroout) {
 		/* zero out from the start of the block to the write offset */
@@ -300,7 +303,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	 */
 	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
 
-	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES);
 	do {
 		size_t n;
 		if (dio->error) {
@@ -343,8 +345,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		dio->size += n;
 		copied += n;
 
-		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
-						 BIO_MAX_PAGES);
+		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
 		iomap_dio_submit_bio(dio, iomap, bio, pos);
 		pos += n;
 	} while (nr_pages);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 63b526d44886..472932b9e6bc 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -416,7 +416,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 * jbd2_cleanup_journal_tail() doesn't get called all that often.
 	 */
 	if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(journal->j_fs_dev);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
 
 	return __jbd2_update_log_tail(journal, first_tid, blocknr);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3cc4ab2ba7f4..b121d7d434c6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -825,7 +825,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	if (commit_transaction->t_need_data_flush &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (jbd2_has_feature_async_commit(journal)) {
@@ -932,7 +932,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	stats.run.rs_blocks_logged++;
 	if (jbd2_has_feature_async_commit(journal) &&
 	    journal->j_flags & JBD2_BARRIER) {
-		blkdev_issue_flush(journal->j_dev);
+		blkdev_issue_flush(journal->j_dev, GFP_NOFS);
 	}
 
 	if (err)
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 69f18fe20923..dc0694fcfcd1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -326,7 +326,7 @@ int jbd2_journal_recover(journal_t *journal)
 		err = err2;
 	/* Make sure all replayed data is on permanent storage */
 	if (journal->j_flags & JBD2_BARRIER) {
-		err2 = blkdev_issue_flush(journal->j_fs_dev);
+		err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL);
 		if (!err)
 			err = err2;
 	}
diff --git a/fs/libfs.c b/fs/libfs.c
index abf7674fb437..79721571e014 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1117,7 +1117,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
 	err = __generic_file_fsync(file, start, end, datasync);
 	if (err)
 		return err;
-	return blkdev_issue_flush(inode->i_sb->s_bdev);
+	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 }
 EXPORT_SYMBOL(generic_file_fsync);
 
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1a96ce28efb0..3be6836074ae 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -123,6 +123,11 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
 
 	npg = min(npg, BIO_MAX_PAGES);
 	bio = bio_alloc(GFP_NOIO, npg);
+	if (!bio && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (npg /= 2))
+			bio = bio_alloc(GFP_NOIO, npg);
+	}
+
 	if (bio) {
 		bio->bi_iter.bi_sector = disk_sector;
 		bio_set_dev(bio, bdev);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 1058659a8d31..a07c39c94bbd 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -254,7 +254,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 	req->cmd[4] = bufflen & 0xff;
 	req->cmd_len = COMMAND_SIZE(INQUIRY);
 
-	blk_execute_rq(NULL, rq, 1);
+	blk_execute_rq(rq->q, NULL, rq, 1);
 	if (req->result) {
 		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
 			req->result);
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e75417bfe6e..1a8729eded8b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -386,6 +386,10 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
 	struct bio *bio;
 
 	bio = bio_alloc(GFP_NOIO, nr_vecs);
+	if (bio == NULL) {
+		while (!bio && (nr_vecs >>= 1))
+			bio = bio_alloc(GFP_NOIO, nr_vecs);
+	}
 	if (likely(bio)) {
 		bio_set_dev(bio, nilfs->ns_bdev);
 		bio->bi_iter.bi_sector =
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 987c8ab02aee..b55cdeb4d169 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs)
 	 */
 	smp_wmb();
 
-	err = blkdev_issue_flush(nilfs->ns_bdev);
+	err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL);
 	if (err != -EIO)
 		err = 0;
 	return err;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index df6d709d2ae3..85979e2214b3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 		needs_barrier = true;
 	err = jbd2_complete_transaction(journal, commit_tid);
 	if (needs_barrier) {
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 		if (!err)
 			err = ret;
 	}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1db0254bc38b..0b641ae694f1 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 	inode_unlock(inode);
 	if (barrier_done < 0)
 		return barrier_done;
diff --git a/fs/splice.c b/fs/splice.c
index 5dbce4dcc1a7..b06846f1e6ee 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -662,14 +662,12 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 		/* build the vector */
 		left = sd.total_len;
-		for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
+		for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
 			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 			size_t this_len = buf->len;
 
-			/* zero-length bvecs are not supported, skip them */
-			if (!this_len)
-				continue;
-			this_len = min(this_len, left);
+			if (this_len > left)
+				this_len = left;
 
 			ret = pipe_buf_confirm(pipe, buf);
 			if (unlikely(ret)) {
@@ -682,7 +680,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 			array[n].bv_len = this_len;
 			array[n].bv_offset = buf->offset;
 			left -= this_len;
-			n++;
 		}
 
 		iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 586d42342a79..21b1d034aca3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -343,7 +343,7 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
 }
 
 STATIC void
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index a29653c0196b..0e7ab0bc00ae 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -541,7 +541,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 	if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
 		ret = file_write_and_wait_range(file, start, end);
 	if (!ret)
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
 
 	if (ret)
 		zonefs_io_error(inode, true);
@@ -678,7 +678,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 	if (!nr_pages)
 		return 0;
 
-	bio = bio_alloc(GFP_NOFS, nr_pages);
+	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
 	if (!bio)
 		return -ENOMEM;
 
@@ -1581,11 +1581,12 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_time_gran	= 1;
 
 	/*
-	 * The block size is set to the device zone write granularity to ensure
-	 * that write operations are always aligned according to the device
-	 * interface constraints.
+	 * The block size is set to the device physical sector size to ensure
+	 * that write operations on 512e devices (512B logical block and 4KB
+	 * physical block) are always aligned to the device physical blocks,
+	 * as mandated by the ZBC/ZAC specifications.
 	 */
-	sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
+	sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev));
 	sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
 	sbi->s_uid = GLOBAL_ROOT_UID;
 	sbi->s_gid = GLOBAL_ROOT_GID;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5b468f2242ff..77c7d3fd00e7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -10,7 +10,6 @@
 #include <linux/ioprio.h>
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include <linux/blk_types.h>
-#include <linux/uio.h>
 
 #define BIO_DEBUG
 
@@ -329,6 +328,7 @@ struct bio_integrity_payload {
 
 	struct bvec_iter	bip_iter;
 
+	unsigned short		bip_slab;	/* slab the bip came from */
 	unsigned short		bip_vcnt;	/* # of integrity bio_vecs */
 	unsigned short		bip_max_vcnt;	/* integrity bio_vec slots */
 	unsigned short		bip_flags;	/* control flags */
@@ -406,9 +406,7 @@ extern void bioset_exit(struct bio_set *);
 extern int biovec_init_pool(mempool_t *pool, int pool_entries);
 extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src);
 
-struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs,
-		struct bio_set *bs);
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs);
+extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
 extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
@@ -416,11 +414,16 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 
 extern struct bio_set fs_bio_set;
 
-static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
+static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
 
+static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+	return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+}
+
 extern blk_qc_t submit_bio(struct bio *);
 
 extern void bio_endio(struct bio *);
@@ -438,18 +441,6 @@ static inline void bio_wouldblock_error(struct bio *bio)
 	bio_endio(bio);
 }
 
-/*
- * Calculate number of bvec segments that should be allocated to fit data
- * pointed by @iter. If @iter is backed by bvec it's going to be reused
- * instead of allocating a new one.
- */
-static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
-{
-	if (iov_iter_is_bvec(iter))
-		return 0;
-	return iov_iter_npages(iter, max_segs);
-}
-
 struct request_queue;
 
 extern int submit_bio_wait(struct bio *bio);
@@ -489,11 +480,13 @@ static inline void zero_fill_bio(struct bio *bio)
 	zero_fill_bio_iter(bio, bio->bi_iter);
 }
 
+extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
+extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
+extern unsigned int bvec_nr_vecs(unsigned short idx);
 extern const char *bio_devname(struct bio *bio, char *buffer);
 
 #define bio_set_dev(bio, bdev) 				\
 do {							\
-	bio_clear_flag(bio, BIO_REMAPPED);		\
 	if ((bio)->bi_bdev != (bdev))			\
 		bio_clear_flag(bio, BIO_THROTTLED);	\
 	(bio)->bi_bdev = (bdev);			\
@@ -502,7 +495,6 @@ do {							\
 
 #define bio_copy_dev(dst, src)			\
 do {						\
-	bio_clear_flag(dst, BIO_REMAPPED);		\
 	(dst)->bi_bdev = (src)->bi_bdev;	\
 	bio_clone_blkg_association(dst, src);	\
 } while (0)
@@ -711,7 +703,6 @@ struct bio_set {
 	mempool_t bvec_integrity_pool;
 #endif
 
-	unsigned int back_pad;
 	/*
 	 * Deadlock avoidance for stacking block drivers: see comments in
 	 * bio_alloc_bioset() for details
@@ -722,6 +713,12 @@ struct bio_set {
 	struct workqueue_struct	*rescue_workqueue;
 };
 
+struct biovec_slab {
+	int nr_vecs;
+	char *name;
+	struct kmem_cache *slab;
+};
+
 static inline bool bioset_initialized(struct bio_set *bs)
 {
 	return bs->bio_slab != NULL;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index aabbf6830ffc..6b410dab48ee 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,10 @@ struct blk_mq_hw_ctx {
 	 * shared across request queues.
 	 */
 	atomic_t		nr_active;
+	/**
+	 * @elevator_queued: Number of queued requests on hctx.
+	 */
+	atomic_t                elevator_queued;
 
 	/** @cpuhp_online: List to store request if CPU is going to die */
 	struct hlist_node	cpuhp_online;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9ec28fcd3bcc..4667720ca095 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -227,7 +227,7 @@ struct bio {
 						 * top bits REQ_OP. Use
 						 * accessors.
 						 */
-	unsigned short		bi_flags;	/* BIO_* below */
+	unsigned short		bi_flags;	/* status, etc and bvec pool number */
 	unsigned short		bi_ioprio;
 	unsigned short		bi_write_hint;
 	blk_status_t		bi_status;
@@ -306,10 +306,36 @@ enum {
 				 * of this bio. */
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
 	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
-	BIO_REMAPPED,
 	BIO_FLAG_LAST
 };
 
+/* See BVEC_POOL_OFFSET below before adding new flags */
+
+/*
+ * We support 6 different bvec pools, the last one is magic in that it
+ * is backed by a mempool.
+ */
+#define BVEC_POOL_NR		6
+#define BVEC_POOL_MAX		(BVEC_POOL_NR - 1)
+
+/*
+ * Top 3 bits of bio flags indicate the pool the bvecs came from.  We add
+ * 1 to the actual index so that 0 indicates that there are no bvecs to be
+ * freed.
+ */
+#define BVEC_POOL_BITS		(3)
+#define BVEC_POOL_OFFSET	(16 - BVEC_POOL_BITS)
+#define BVEC_POOL_IDX(bio)	((bio)->bi_flags >> BVEC_POOL_OFFSET)
+#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
+# error "BVEC_POOL_BITS is too small"
+#endif
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * only BVEC_POOL_IDX()
+ */
+#define BIO_RESET_BITS	BVEC_POOL_OFFSET
+
 typedef __u32 __bitwise blk_mq_req_flags_t;
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9149f4a5adb3..b55bd534b2e1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -337,7 +337,6 @@ struct queue_limits {
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
-	unsigned int		zone_write_granularity;
 
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
@@ -949,8 +948,9 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
 			       struct rq_map_data *, const struct iov_iter *,
 			       gfp_t);
-extern void blk_execute_rq(struct gendisk *, struct request *, int);
-extern void blk_execute_rq_nowait(struct gendisk *,
+extern void blk_execute_rq(struct request_queue *, struct gendisk *,
+			  struct request *, int);
+extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 
 /* Helper to convert REQ_OP_XXX to its string format XXX */
@@ -1161,8 +1161,6 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
 		unsigned int max_zone_append_sectors);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
-void blk_queue_zone_write_granularity(struct request_queue *q,
-				      unsigned int size);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 void blk_queue_update_readahead(struct request_queue *q);
@@ -1291,7 +1289,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 		 !list_empty(&plug->cb_list));
 }
 
-int blkdev_issue_flush(struct block_device *bdev);
+int blkdev_issue_flush(struct block_device *, gfp_t);
 long nr_blockdev_pages(void);
 #else /* CONFIG_BLOCK */
 struct blk_plug {
@@ -1319,7 +1317,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	return false;
 }
 
-static inline int blkdev_issue_flush(struct block_device *bdev)
+static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
 {
 	return 0;
 }
@@ -1476,18 +1474,6 @@ static inline int bdev_io_opt(struct block_device *bdev)
 	return queue_io_opt(bdev_get_queue(bdev));
 }
 
-static inline unsigned int
-queue_zone_write_granularity(const struct request_queue *q)
-{
-	return q->limits.zone_write_granularity;
-}
-
-static inline unsigned int
-bdev_zone_write_granularity(struct block_device *bdev)
-{
-	return queue_zone_write_granularity(bdev_get_queue(bdev));
-}
-
 static inline int queue_alignment_offset(const struct request_queue *q)
 {
 	if (q->limits.misaligned)
@@ -1968,9 +1954,22 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
 void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 		unsigned long start_time);
 
-unsigned long bio_start_io_acct(struct bio *bio);
-void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
-		struct block_device *orig_bdev);
+unsigned long part_start_io_acct(struct gendisk *disk,
+		struct block_device **part, struct bio *bio);
+void part_end_io_acct(struct block_device *part, struct bio *bio,
+		      unsigned long start_time);
+
+/**
+ * bio_start_io_acct - start I/O accounting for bio based drivers
+ * @bio:	bio to start account for
+ *
+ * Returns the start time that should be passed back to bio_end_io_acct().
+ */
+static inline unsigned long bio_start_io_acct(struct bio *bio)
+{
+	return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio),
+				  bio_op(bio));
+}
 
 /**
  * bio_end_io_acct - end I/O accounting for bio based drivers
@@ -1979,7 +1978,7 @@ void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
  */
 static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
 {
-	return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
+	return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time);
 }
 
 int bdev_read_only(struct block_device *bdev);
@@ -2014,16 +2013,21 @@ void bdev_add(struct block_device *bdev, dev_t dev);
 struct block_device *I_BDEV(struct inode *inode);
 struct block_device *bdgrab(struct block_device *bdev);
 void bdput(struct block_device *);
-int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
-		loff_t lend);
 
 #ifdef CONFIG_BLOCK
 void invalidate_bdev(struct block_device *bdev);
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
+			loff_t lend);
 int sync_blockdev(struct block_device *bdev);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
 {
 }
+static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+				      loff_t lstart, loff_t lend)
+{
+	return 0;
+}
 static inline int sync_blockdev(struct block_device *bdev)
 {
 	return 0;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1fe8e105b83b..bacc40a0bdf3 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -172,8 +172,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 
 /* Supports zoned block devices sequential write constraint */
 #define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
-/* Supports scheduling on multiple hardware queues */
-#define ELEVATOR_F_MQ_AWARE		(1U << 1)
 
 #endif /* CONFIG_BLOCK */
 #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index f364619092cc..a62ccbfac54b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -32,7 +32,6 @@ extern struct class block_class;
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/workqueue.h>
-#include <linux/xarray.h>
 
 #define PARTITION_META_INFO_VOLNAMELTH	64
 /*
@@ -117,6 +116,13 @@ enum {
 	DISK_EVENT_FLAG_UEVENT			= 1 << 1,
 };
 
+struct disk_part_tbl {
+	struct rcu_head rcu_head;
+	int len;
+	struct block_device __rcu *last_lookup;
+	struct block_device __rcu *part[];
+};
+
 struct disk_events;
 struct badblocks;
 
@@ -142,7 +148,12 @@ struct gendisk {
 	unsigned short events;		/* supported events */
 	unsigned short event_flags;	/* flags related to event processing */
 
-	struct xarray part_tbl;
+	/* Array of pointers to partitions indexed by partno.
+	 * Protected with matching bdev lock but stat and other
+	 * non-critical accesses use RCU.  Always access through
+	 * helpers.
+	 */
+	struct disk_part_tbl __rcu *part_tbl;
 	struct block_device *part0;
 
 	const struct block_device_operations *fops;
@@ -202,11 +213,10 @@ static inline dev_t disk_devt(struct gendisk *disk)
 	return MKDEV(disk->major, disk->first_minor);
 }
 
-void disk_uevent(struct gendisk *disk, enum kobject_action action);
-
 /*
  * Smarter partition iterator without context limits.
  */
+#define DISK_PITER_REVERSE	(1 << 0) /* iterate in the reverse direction */
 #define DISK_PITER_INCL_EMPTY	(1 << 1) /* include 0-sized parts */
 #define DISK_PITER_INCL_PART0	(1 << 2) /* include partition 0 */
 #define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */
@@ -214,7 +224,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action);
 struct disk_part_iter {
 	struct gendisk		*disk;
 	struct block_device	*part;
-	unsigned long		idx;
+	int			idx;
 	unsigned int		flags;
 };
 
@@ -222,6 +232,7 @@ extern void disk_part_iter_init(struct disk_part_iter *piter,
 				 struct gendisk *disk, unsigned int flags);
 struct block_device *disk_part_iter_next(struct disk_part_iter *piter);
 extern void disk_part_iter_exit(struct disk_part_iter *piter);
+extern bool disk_has_partitions(struct gendisk *disk);
 
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3f1f7ae0fbe9..596bc2f4d9b0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -468,6 +468,7 @@ extern int free_swap_and_cache(swp_entry_t);
 int swap_type_of(dev_t device, sector_t offset);
 int find_first_swap(dev_t *device);
 extern unsigned int count_swap_pages(int, int);
+extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int __swap_count(swp_entry_t entry);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d8ca336ff9cd..f0b2ccb1bb01 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -72,6 +72,8 @@
 	__start.bi_bvec_done = skip;			\
 	__start.bi_idx = 0;				\
 	for_each_bvec(__v, i->bvec, __bi, __start) {	\
+		if (!__v.bv_len)			\
+			continue;			\
 		(void)(STEP);				\
 	}						\
 }
@@ -1067,21 +1069,6 @@ static void pipe_advance(struct iov_iter *i, size_t size)
 	pipe_truncate(i);
 }
 
-static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
-{
-	struct bvec_iter bi;
-
-	bi.bi_size = i->count;
-	bi.bi_bvec_done = i->iov_offset;
-	bi.bi_idx = 0;
-	bvec_iter_advance(i->bvec, &bi, size);
-
-	i->bvec += bi.bi_idx;
-	i->nr_segs -= bi.bi_idx;
-	i->count = bi.bi_size;
-	i->iov_offset = bi.bi_bvec_done;
-}
-
 void iov_iter_advance(struct iov_iter *i, size_t size)
 {
 	if (unlikely(iov_iter_is_pipe(i))) {
@@ -1092,10 +1079,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		i->count -= size;
 		return;
 	}
-	if (iov_iter_is_bvec(i)) {
-		iov_iter_bvec_advance(i, size);
-		return;
-	}
 	iterate_and_advance(i, size, v, 0, 0, 0)
 }
 EXPORT_SYMBOL(iov_iter_advance);
diff --git a/mm/page_io.c b/mm/page_io.c
index 92f7941c6d01..a75f35464a4e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,6 +26,25 @@
 #include <linux/uio.h>
 #include <linux/sched/task.h>
 
+static struct bio *get_swap_bio(gfp_t gfp_flags,
+				struct page *page, bio_end_io_t end_io)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_flags, 1);
+	if (bio) {
+		struct block_device *bdev;
+
+		bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
+		bio_set_dev(bio, bdev);
+		bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
+		bio->bi_end_io = end_io;
+
+		bio_add_page(bio, page, thp_size(page), 0);
+	}
+	return bio;
+}
+
 void end_swap_bio_write(struct bio *bio)
 {
 	struct page *page = bio_first_page_all(bio);
@@ -342,13 +361,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		return 0;
 	}
 
-	bio = bio_alloc(GFP_NOIO, 1);
-	bio_set_dev(bio, sis->bdev);
-	bio->bi_iter.bi_sector = swap_page_sector(page);
+	bio = get_swap_bio(GFP_NOIO, page, end_write_func);
+	if (bio == NULL) {
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
 	bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
-	bio->bi_end_io = end_write_func;
-	bio_add_page(bio, page, thp_size(page), 0);
-
 	bio_associate_blkg_from_page(bio, page);
 	count_swpout_vm_event(page);
 	set_page_writeback(page);
@@ -408,18 +427,18 @@ int swap_readpage(struct page *page, bool synchronous)
 	}
 
 	ret = 0;
-	bio = bio_alloc(GFP_KERNEL, 1);
-	bio_set_dev(bio, sis->bdev);
-	bio->bi_opf = REQ_OP_READ;
-	bio->bi_iter.bi_sector = swap_page_sector(page);
-	bio->bi_end_io = end_swap_bio_read;
-	bio_add_page(bio, page, thp_size(page), 0);
-
+	bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
+	if (bio == NULL) {
+		unlock_page(page);
+		ret = -ENOMEM;
+		goto out;
+	}
 	disk = bio->bi_bdev->bd_disk;
 	/*
 	 * Keep this task valid during swap readpage because the oom killer may
 	 * attempt to access it in the page fault retry time check.
 	 */
+	bio_set_op_attrs(bio, REQ_OP_READ, 0);
 	if (synchronous) {
 		bio->bi_opf |= REQ_HIPRI;
 		get_task_struct(current);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 21a98cb8d646..9fffc5af29d1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,7 @@
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 				 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 
 DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -1849,13 +1850,12 @@ int find_first_swap(dev_t *device)
  */
 sector_t swapdev_block(int type, pgoff_t offset)
 {
+	struct block_device *bdev;
 	struct swap_info_struct *si = swap_type_to_swap_info(type);
-	struct swap_extent *se;
 
 	if (!si || !(si->flags & SWP_WRITEOK))
 		return 0;
-	se = offset_to_swap_extent(si, offset);
-	return se->start_block + (offset - se->start_page);
+	return map_swap_entry(swp_entry(type, offset), &bdev);
 }
 
 /*
@@ -2281,6 +2281,36 @@ static void drain_mmlist(void)
 	spin_unlock(&mmlist_lock);
 }
 
+/*
+ * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
+ */
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
+{
+	struct swap_info_struct *sis;
+	struct swap_extent *se;
+	pgoff_t offset;
+
+	sis = swp_swap_info(entry);
+	*bdev = sis->bdev;
+
+	offset = swp_offset(entry);
+	se = offset_to_swap_extent(sis, offset);
+	return se->start_block + (offset - se->start_page);
+}
+
+/*
+ * Returns the page offset into bdev for the specified page's swap entry.
+ */
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
+{
+	swp_entry_t entry;
+	entry.val = page_private(page);
+	return map_swap_entry(entry, bdev);
+}
+
 /*
  * Free all of a swapdev's extent information
  */