From db04e18dbb0146d3c753dc05f7233350375bbc48 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 19 Aug 2020 14:34:03 +0200 Subject: [PATCH 01/23] block: Make request_queue.rpm_status an enum request_queue.rpm_status is assigned values of the rpm_status enum only, so reflect that in its type. Note that including <linux/pm.h> is (currently) a no-op, as it is already included indirectly through other headers, but it is better to play it safe. Signed-off-by: Geert Uytterhoeven Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb5636cc17b9..0a1730b30ad2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include <linux/pm.h> struct module; struct scsi_ioctl_command; @@ -458,7 +459,7 @@ struct request_queue { #ifdef CONFIG_PM struct device *dev; - int rpm_status; + enum rpm_status rpm_status; unsigned int nr_pending; #endif From 611bee526b4a89d49f1b9914a770bfdc101d5fb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Aug 2020 11:10:41 +0200 Subject: [PATCH 02/23] block: replace bd_set_size with bd_set_nr_sectors Replace bd_set_size with a version that takes the number of sectors instead, as that fits most of the current and future callers much better. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++-- drivers/block/nbd.c | 7 ++++--- drivers/block/pktcdvd.c | 2 +- drivers/nvme/host/nvme.h | 2 +- fs/block_dev.c | 10 +++++----- include/linux/genhd.h | 2 +- 6 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d3394191e168..cb1191d6e945 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -253,7 +253,7 @@ static void loop_set_size(struct loop_device *lo, loff_t size) { struct block_device *bdev = lo->lo_device; - bd_set_size(bdev, size << SECTOR_SHIFT); + bd_set_nr_sectors(bdev, size); set_capacity_revalidate_and_notify(lo->lo_disk, size, false); } @@ -1251,7 +1251,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { - bd_set_size(bdev, 0); + bd_set_nr_sectors(bdev, 0); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index edf8b632e3d2..a54f2d155a31 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -300,6 +300,7 @@ static void nbd_size_update(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; struct block_device *bdev = bdget_disk(nbd->disk, 0); + sector_t nr_sectors = config->bytesize >> 9; if (config->flags & NBD_FLAG_SEND_TRIM) { nbd->disk->queue->limits.discard_granularity = config->blksize; @@ -308,10 +309,10 @@ static void nbd_size_update(struct nbd_device *nbd) } blk_queue_logical_block_size(nbd->disk->queue, config->blksize); blk_queue_physical_block_size(nbd->disk->queue, config->blksize); - set_capacity(nbd->disk, config->bytesize >> 9); + set_capacity(nbd->disk, nr_sectors); if (bdev) { if (bdev->bd_disk) { - bd_set_size(bdev, config->bytesize); + bd_set_nr_sectors(bdev, nr_sectors); set_blocksize(bdev, config->blksize); 
} else bdev->bd_invalidated = 1; @@ -1138,7 +1139,7 @@ static void nbd_bdev_reset(struct block_device *bdev) { if (bdev->bd_openers > 1) return; - bd_set_size(bdev, 0); + bd_set_nr_sectors(bdev, 0); } static void nbd_parse_flags(struct nbd_device *nbd) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 1034e445680c..17f2e6ff1223 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2192,7 +2192,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) set_capacity(pd->disk, lba << 2); set_capacity(pd->bdev->bd_disk, lba << 2); - bd_set_size(pd->bdev, (loff_t)lba << 11); + bd_set_nr_sectors(pd->bdev, lba << 2); q = bdev_get_queue(pd->bdev); if (write) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2910f6caab7d..aab130f31e25 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -688,7 +688,7 @@ static inline void nvme_mpath_update_disk_size(struct gendisk *disk) struct block_device *bdev = bdget_disk(disk, 0); if (bdev) { - bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT); + bd_set_nr_sectors(bdev, get_capacity(disk)); bdput(bdev); } } diff --git a/fs/block_dev.c b/fs/block_dev.c index 8ae833e00443..f52597172c8b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1371,13 +1371,13 @@ int check_disk_change(struct block_device *bdev) EXPORT_SYMBOL(check_disk_change); -void bd_set_size(struct block_device *bdev, loff_t size) +void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { inode_lock(bdev->bd_inode); - i_size_write(bdev->bd_inode, size); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); inode_unlock(bdev->bd_inode); } -EXPORT_SYMBOL(bd_set_size); +EXPORT_SYMBOL(bd_set_nr_sectors); static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); @@ -1514,7 +1514,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, } if (!ret) { - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + 
bd_set_nr_sectors(bdev, get_capacity(disk)); set_init_blocksize(bdev); } @@ -1542,7 +1542,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, ret = -ENXIO; goto out_clear; } - bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 4ab853461dff..39025dc0397c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -375,7 +375,7 @@ void unregister_blkdev(unsigned int major, const char *name); int revalidate_disk(struct gendisk *disk); int check_disk_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void bd_set_size(struct block_device *bdev, loff_t size); +void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); /* for drivers/char/raw.c: */ int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); From c2b4bb8cb3741c0bacf3683e4c1ecd04c977ada3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Aug 2020 11:10:42 +0200 Subject: [PATCH 03/23] block: fix locking for struct block_device size updates Two different callers use two different mutexes for updating the block device size, which obviously doesn't help to actually protect against concurrent updates from the different callers. In addition one of the locks, bd_mutex is rather prone to deadlocks with other parts of the block stack that use it for high level synchronization. Switch to using a new spinlock protecting just the size updates, as that is all we need, and make sure everyone does the update through the proper helper. This fixes a bug reported with the nvme revalidating disks during a hot removal operation, which can currently deadlock on bd_mutex. 
Reported-by: Xianting Tian Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/partitions/core.c | 4 ++-- drivers/block/aoe/aoecmd.c | 4 +--- drivers/md/dm.c | 15 ++------------- drivers/s390/block/dasd_ioctl.c | 9 ++------- fs/block_dev.c | 25 ++++++++++++++----------- include/linux/blk_types.h | 1 + 6 files changed, 22 insertions(+), 36 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 5b4869c08fb3..b1c0b50ca92d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -592,8 +592,8 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, (sector_t)length); - i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT); + part_nr_sects_write(part, length); + bd_set_nr_sectors(bdevp, length); ret = 0; out_unlock: diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 6dba41395155..313f0b946fe2 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -900,9 +900,7 @@ aoecmd_sleepwork(struct work_struct *work) ssize = get_capacity(d->gd); bd = bdget_disk(d->gd, 0); if (bd) { - inode_lock(bd->bd_inode); - i_size_write(bd->bd_inode, (loff_t)ssize<<9); - inode_unlock(bd->bd_inode); + bd_set_nr_sectors(bd, ssize); bdput(bd); } spin_lock_irq(&d->lock); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fb0255d25e4b..3dedd9cc4fb6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2097,18 +2097,6 @@ static void event_callback(void *context) dm_issue_global_event(); } -/* - * Protected by md->suspend_lock obtained by dm_swap_table(). - */ -static void __set_size(struct mapped_device *md, sector_t size) -{ - lockdep_assert_held(&md->suspend_lock); - - set_capacity(md->disk, size); - - i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); -} - /* * Returns old map, which caller must destroy. 
*/ @@ -2131,7 +2119,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, if (size != dm_get_size(md)) memset(&md->geometry, 0, sizeof(md->geometry)); - __set_size(md, size); + set_capacity(md->disk, size); + bd_set_nr_sectors(md->bdev, size); dm_table_event_callback(t, event_callback, md); diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 777734d1b4e5..faaf5596e31c 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -55,10 +55,7 @@ dasd_ioctl_enable(struct block_device *bdev) dasd_enable_device(base); /* Formatting the dasd device can change the capacity. */ - mutex_lock(&bdev->bd_mutex); - i_size_write(bdev->bd_inode, - (loff_t)get_capacity(base->block->gdp) << 9); - mutex_unlock(&bdev->bd_mutex); + bd_set_nr_sectors(bdev, get_capacity(base->block->gdp)); dasd_put_device(base); return 0; } @@ -91,9 +88,7 @@ dasd_ioctl_disable(struct block_device *bdev) * Set i_size to zero, since read, write, etc. check against this * value. 
*/ - mutex_lock(&bdev->bd_mutex); - i_size_write(bdev->bd_inode, 0); - mutex_unlock(&bdev->bd_mutex); + bd_set_nr_sectors(bdev, 0); dasd_put_device(base); return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index f52597172c8b..08158bb2e76c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -876,6 +876,7 @@ struct block_device *bdget(dev_t dev) bdev = &BDEV_I(inode)->bdev; if (inode->i_state & I_NEW) { + spin_lock_init(&bdev->bd_size_lock); bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; @@ -1290,6 +1291,7 @@ static void check_disk_size_change(struct gendisk *disk, { loff_t disk_size, bdev_size; + spin_lock(&bdev->bd_size_lock); disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { @@ -1299,11 +1301,15 @@ static void check_disk_size_change(struct gendisk *disk, disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); - if (bdev_size > disk_size && __invalidate_device(bdev, false)) + } + bdev->bd_invalidated = 0; + spin_unlock(&bdev->bd_size_lock); + + if (bdev_size > disk_size) { + if (__invalidate_device(bdev, false)) pr_warn("VFS: busy inodes on resized disk %s\n", disk->disk_name); } - bdev->bd_invalidated = 0; } /** @@ -1328,13 +1334,10 @@ int revalidate_disk(struct gendisk *disk) if (!(disk->flags & GENHD_FL_HIDDEN)) { struct block_device *bdev = bdget_disk(disk, 0); - if (!bdev) - return ret; - - mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev, ret == 0); - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); + if (bdev) { + check_disk_size_change(disk, bdev, ret == 0); + bdput(bdev); + } } return ret; } @@ -1373,9 +1376,9 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { - inode_lock(bdev->bd_inode); + spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); - inode_unlock(bdev->bd_inode); + spin_unlock(&bdev->bd_size_lock); } 
EXPORT_SYMBOL(bd_set_nr_sectors); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4ecf4fed171f..5accc2549d22 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -38,6 +38,7 @@ struct block_device { /* number of times partitions within this device have been opened. */ unsigned bd_part_count; int bd_invalidated; + spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; struct backing_dev_info *bd_bdi; From c13f0fbc4c191aab5e95b01589ff5bbc6556e4f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Aug 2020 11:10:43 +0200 Subject: [PATCH 04/23] nvme: don't call revalidate_disk from nvme_set_queue_dying In nvme_set_queue_dying we really just want to ensure the disk and bdev sizes are set to zero. Going through revalidate_disk leads to a somewhat arcane and complex callchain relying on special behavior in a few places. Instead just lift the set_capacity directly to nvme_set_queue_dying, and rename and move the nvme_mpath_update_disk_size helper so that we can use it in nvme_set_queue_dying to propagate the size to the bdev without detours. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 33 +++++++++++++++++++++++---------- drivers/nvme/host/nvme.h | 13 ------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d543bc1747fd..c7e01d9667ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -94,21 +94,34 @@ static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); +static void nvme_update_bdev_size(struct gendisk *disk) +{ + struct block_device *bdev = bdget_disk(disk, 0); + + if (bdev) { + bd_set_nr_sectors(bdev, get_capacity(disk)); + bdput(bdev); + } +} + +/* + * Prepare a queue for teardown. 
+ * + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set + * the capacity to 0 after that to avoid blocking dispatchers that may be + * holding bd_mutex. This will end buffered writers dirtying pages that can't + * be synced. + */ static void nvme_set_queue_dying(struct nvme_ns *ns) { - /* - * Revalidating a dead namespace sets capacity to 0. This will end - * buffered writers dirtying pages that can't be synced. - */ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; + blk_set_queue_dying(ns->queue); - /* Forcibly unquiesce queues to avoid blocking dispatch */ blk_mq_unquiesce_queue(ns->queue); - /* - * Revalidate after unblocking dispatchers that may be holding bd_butex - */ - revalidate_disk(ns->disk); + + set_capacity(ns->disk, 0); + nvme_update_bdev_size(ns->disk); } static void nvme_queue_scan(struct nvme_ctrl *ctrl) @@ -2134,7 +2147,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); - nvme_mpath_update_disk_size(ns->head->disk); + nvme_update_bdev_size(ns->head->disk); } #endif return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index aab130f31e25..87737fa32360 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -683,16 +683,6 @@ static inline void nvme_trace_bio_complete(struct request *req, trace_block_bio_complete(ns->head->disk->queue, req->bio); } -static inline void nvme_mpath_update_disk_size(struct gendisk *disk) -{ - struct block_device *bdev = bdget_disk(disk, 0); - - if (bdev) { - bd_set_nr_sectors(bdev, get_capacity(disk)); - bdput(bdev); - } -} - extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; @@ -767,9 +757,6 @@ static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) static inline void 
nvme_mpath_start_freeze(struct nvme_subsystem *subsys) { } -static inline void nvme_mpath_update_disk_size(struct gendisk *disk) -{ -} #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_BLK_DEV_ZONED From f3256075ba49d80835b601bfbff350a2140b2924 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Aug 2020 17:37:45 +0200 Subject: [PATCH 05/23] block: remove the BIO_NULL_MAPPED flag We can simply use a boolean flag in the bio_map_data data structure instead. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-map.c | 9 +++++---- include/linux/blk_types.h | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 6e804892d5ec..51e6195f878d 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -12,7 +12,8 @@ #include "blk.h" struct bio_map_data { - int is_our_pages; + bool is_our_pages : 1; + bool is_null_mapped : 1; struct iov_iter iter; struct iovec iov[]; }; @@ -108,7 +109,7 @@ static int bio_uncopy_user(struct bio *bio) struct bio_map_data *bmd = bio->bi_private; int ret = 0; - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + if (!bmd || !bmd->is_null_mapped) { /* * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free @@ -158,7 +159,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, * The caller provided iov might point to an on-stack or otherwise * shortlived one. */ - bmd->is_our_pages = map_data ? 
0 : 1; + bmd->is_our_pages = !map_data; nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); if (nr_pages > BIO_MAX_PAGES) @@ -234,7 +235,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, bio->bi_private = bmd; if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); + bmd->is_null_mapped = true; return bio; cleanup: if (!map_data) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5accc2549d22..78b073956884 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -257,7 +257,6 @@ enum { BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ BIO_USER_MAPPED, /* contains user pages */ - BIO_NULL_MAPPED, /* contains invalid user pages */ BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ From 7b63c052a580d7d5ed83a6e2a327e85881fa679a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Aug 2020 17:37:46 +0200 Subject: [PATCH 06/23] block: remove __blk_rq_unmap_user Open code __blk_rq_unmap_user in the two callers. Both never pass a NULL bio, and one of them can use an existing local variable instead of the bio flag. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-map.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 51e6195f878d..10de4809edf9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -558,20 +558,6 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio) } EXPORT_SYMBOL(blk_rq_append_bio); -static int __blk_rq_unmap_user(struct bio *bio) -{ - int ret = 0; - - if (bio) { - if (bio_flagged(bio, BIO_USER_MAPPED)) - bio_unmap_user(bio); - else - ret = bio_uncopy_user(bio); - } - - return ret; -} - static int __blk_rq_map_user_iov(struct request *rq, struct rq_map_data *map_data, struct iov_iter *iter, gfp_t gfp_mask, bool copy) @@ -599,7 +585,10 @@ static int __blk_rq_map_user_iov(struct request *rq, */ ret = blk_rq_append_bio(rq, &bio); if (ret) { - __blk_rq_unmap_user(orig_bio); + if (copy) + bio_uncopy_user(orig_bio); + else + bio_unmap_user(orig_bio); return ret; } bio_get(bio); @@ -701,9 +690,13 @@ int blk_rq_unmap_user(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_BOUNCED))) mapped_bio = bio->bi_private; - ret2 = __blk_rq_unmap_user(mapped_bio); - if (ret2 && !ret) - ret = ret2; + if (bio_flagged(mapped_bio, BIO_USER_MAPPED)) { + bio_unmap_user(mapped_bio); + } else { + ret2 = bio_uncopy_user(mapped_bio); + if (ret2 && !ret) + ret = ret2; + } mapped_bio = bio; bio = bio->bi_next; From 7589ad6729d6a707c6ce97693106d42cbb121b42 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Aug 2020 17:37:47 +0200 Subject: [PATCH 07/23] block: remove __blk_rq_map_user_iov Just duplicate a small amount of code in the low-level map into the bio and copy to the bio routines, leading to much easier to follow and maintain code, and better shared error handling. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-map.c | 144 ++++++++++++++++++------------------------------ 1 file changed, 54 insertions(+), 90 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 10de4809edf9..427962ac2f67 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -127,24 +127,12 @@ static int bio_uncopy_user(struct bio *bio) return ret; } -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -static struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, struct iov_iter *iter, - gfp_t gfp_mask) +static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, + struct iov_iter *iter, gfp_t gfp_mask) { struct bio_map_data *bmd; struct page *page; - struct bio *bio; + struct bio *bio, *bounce_bio; int i = 0, ret; int nr_pages; unsigned int len = iter->count; @@ -152,7 +140,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, bmd = bio_alloc_map_data(iter, gfp_mask); if (!bmd) - return ERR_PTR(-ENOMEM); + return -ENOMEM; /* * We need to do a deep copy of the iov_iter including the iovecs. 
@@ -169,8 +157,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; - - ret = 0; + bio->bi_opf |= req_op(rq); if (map_data) { nr_pages = 1 << map_data->page_order; @@ -187,7 +174,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, if (map_data) { if (i == map_data->nr_entries * nr_pages) { ret = -ENOMEM; - break; + goto cleanup; } page = map_data->pages[i / nr_pages]; @@ -195,14 +182,14 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, i++; } else { - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(rq->q->bounce_gfp | gfp_mask); if (!page) { ret = -ENOMEM; - break; + goto cleanup; } } - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; @@ -212,9 +199,6 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, offset = 0; } - if (ret) - goto cleanup; - if (map_data) map_data->offset += bio->bi_iter.bi_size; @@ -236,39 +220,42 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, bio->bi_private = bmd; if (map_data && map_data->null_mapped) bmd->is_null_mapped = true; - return bio; + + bounce_bio = bio; + ret = blk_rq_append_bio(rq, &bounce_bio); + if (ret) + goto cleanup; + + /* + * We link the bounce buffer in and could have to traverse it later, so + * we have to get a ref to prevent it from being freed + */ + bio_get(bounce_bio); + return 0; cleanup: if (!map_data) bio_free_pages(bio); bio_put(bio); out_bmd: kfree(bmd); - return ERR_PTR(ret); + return ret; } -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. 
- */ -static struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, gfp_t gfp_mask) +static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, + gfp_t gfp_mask) { - unsigned int max_sectors = queue_max_hw_sectors(q); - int j; - struct bio *bio; + unsigned int max_sectors = queue_max_hw_sectors(rq->q); + struct bio *bio, *bounce_bio; int ret; + int j; if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); + return -EINVAL; bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); if (!bio) - return ERR_PTR(-ENOMEM); + return -ENOMEM; + bio->bi_opf |= req_op(rq); while (iov_iter_count(iter)) { struct page **pages; @@ -284,7 +271,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q, npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - if (unlikely(offs & queue_dma_alignment(q))) { + if (unlikely(offs & queue_dma_alignment(rq->q))) { ret = -EINVAL; j = 0; } else { @@ -296,7 +283,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q, if (n > bytes) n = bytes; - if (!bio_add_hw_page(q, bio, page, n, offs, + if (!bio_add_hw_page(rq->q, bio, page, n, offs, max_sectors, &same_page)) { if (same_page) put_page(page); @@ -323,18 +310,30 @@ static struct bio *bio_map_user_iov(struct request_queue *q, bio_set_flag(bio, BIO_USER_MAPPED); /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it + * Subtle: if we end up needing to bounce a bio, it would normally + * disappear when its bi_end_io is run. 
However, we need the original + * bio for the unmap, so grab an extra reference to it */ bio_get(bio); - return bio; + bounce_bio = bio; + ret = blk_rq_append_bio(rq, &bounce_bio); + if (ret) + goto out_put_orig; + + /* + * We link the bounce buffer in and could have to traverse it + * later, so we have to get a ref to prevent it from being freed + */ + bio_get(bounce_bio); + return 0; + + out_put_orig: + bio_put(bio); out_unmap: bio_release_pages(bio, false); bio_put(bio); - return ERR_PTR(ret); + return ret; } /** @@ -558,44 +557,6 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio) } EXPORT_SYMBOL(blk_rq_append_bio); -static int __blk_rq_map_user_iov(struct request *rq, - struct rq_map_data *map_data, struct iov_iter *iter, - gfp_t gfp_mask, bool copy) -{ - struct request_queue *q = rq->q; - struct bio *bio, *orig_bio; - int ret; - - if (copy) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - bio->bi_opf &= ~REQ_OP_MASK; - bio->bi_opf |= req_op(rq); - - orig_bio = bio; - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - ret = blk_rq_append_bio(rq, &bio); - if (ret) { - if (copy) - bio_uncopy_user(orig_bio); - else - bio_unmap_user(orig_bio); - return ret; - } - bio_get(bio); - - return 0; -} - /** * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted @@ -639,7 +600,10 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, i = *iter; do { - ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (copy) + ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); + else + ret = bio_map_user_iov(rq, &i, gfp_mask); if (ret) goto unmap_rq; if (!bio) From 3310eebafe6f9a872c1f757b3d822dafae9c0cd8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Aug 2020 
17:37:48 +0200 Subject: [PATCH 08/23] block: remove the BIO_USER_MAPPED flag Just check if there is private data, in which case the bio must have originated from bio_copy_user_iov. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-map.c | 10 ++++------ include/linux/blk_types.h | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 427962ac2f67..be118926ccf4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -109,7 +109,7 @@ static int bio_uncopy_user(struct bio *bio) struct bio_map_data *bmd = bio->bi_private; int ret = 0; - if (!bmd || !bmd->is_null_mapped) { + if (!bmd->is_null_mapped) { /* * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free @@ -307,8 +307,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, break; } - bio_set_flag(bio, BIO_USER_MAPPED); - /* * Subtle: if we end up needing to bounce a bio, it would normally * disappear when its bi_end_io is run. 
However, we need the original @@ -654,12 +652,12 @@ int blk_rq_unmap_user(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_BOUNCED))) mapped_bio = bio->bi_private; - if (bio_flagged(mapped_bio, BIO_USER_MAPPED)) { - bio_unmap_user(mapped_bio); - } else { + if (bio->bi_private) { ret2 = bio_uncopy_user(mapped_bio); if (ret2 && !ret) ret = ret2; + } else { + bio_unmap_user(mapped_bio); } mapped_bio = bio; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 78b073956884..63a39e47fc60 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -256,7 +256,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_USER_MAPPED, /* contains user pages */ BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ From c4823983538d5fdf38575b3f3ba3a0d10b7f021a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Aug 2020 09:35:33 +0200 Subject: [PATCH 09/23] raw: deprecate the raw driver The raw driver has been replaced by O_DIRECT support on the block device in 2002. Deprecate it to prepare for removal in a few kernel releases. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/char/raw.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 380bf518338e..ccf5bd528642 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -63,6 +63,11 @@ static int raw_open(struct inode *inode, struct file *filp) return 0; } + pr_warn_ratelimited( + "process %s (pid %d) is using the deprecated raw device\n" + "support will be removed in Linux 5.14.\n", + current->comm, current->pid); + mutex_lock(&raw_mutex); /* From 4ce790632803bf2ec47271895a59936c70df7e78 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Fri, 21 Aug 2020 09:19:15 +0800 Subject: [PATCH 10/23] virtio-blk: Use kobj_to_dev() instead of container_of() Use kobj_to_dev() instead of container_of() Signed-off-by: Tian Tao Reviewed-by: Bart Van Assche Reviewed-by: Stefan Hajnoczi Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b2e48dac1ebd..ca63a41059d6 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -646,7 +646,7 @@ static struct attribute *virtblk_attrs[] = { static umode_t virtblk_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct gendisk *disk = dev_to_disk(dev); struct virtio_blk *vblk = disk->private_data; struct virtio_device *vdev = vblk->vdev; From 339b5a25c28c1d85afd0ce882f57962ad550bc6a Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 29 Aug 2020 02:13:53 -0700 Subject: [PATCH 11/23] blk-wbt: Remove obsolete multiqueue I/O scheduling comment This comment was added before the multiqueue I/O scheduler framework was introduced; multiqueue has support for I/O scheduling now, so this obsolete comment can be removed. 
Signed-off-by: Danny Lin Signed-off-by: Jens Axboe --- block/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index bbad5e8bbffe..a2297edfdde8 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -161,8 +161,6 @@ config BLK_WBT_MQ depends on BLK_WBT help Enable writeback throttling by default on multiqueue devices. - Multiqueue currently doesn't have support for IO scheduling, - enabling this option is recommended. config BLK_DEBUG_FS bool "Block layer debugging information in debugfs" From 8e756373d7c8eb6f8876411bed45e4b1c736eb53 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 28 Aug 2020 10:52:54 +0800 Subject: [PATCH 12/23] block: Move bio merge related functions into blk-merge.c It's better to move bio merge related functions into blk-merge.c, which contains all merge related functions. Reviewed-by: Christoph Hellwig Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe --- block/blk-core.c | 156 --------------------------------------------- block/blk-merge.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 156 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 10c08ac50697..11661f5b64d6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -643,162 +643,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -static void blk_account_io_merge_bio(struct request *req) -{ - if (!blk_do_io_stat(req)) - return; - - part_stat_lock(); - part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); - part_stat_unlock(); -} - -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_back_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_backmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - req->biotail->bi_next = bio; - req->biotail = bio; 
- req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_free_ctx(bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_front_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_frontmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - bio->bi_next = req->bio; - req->bio = bio; - - req->__sector = bio->bi_iter.bi_sector; - req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_do_front_merge(req, bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio *bio) -{ - unsigned short segments = blk_rq_nr_discard_segments(req); - - if (segments >= queue_max_discard_segments(q)) - goto no_merge; - if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req, blk_rq_pos(req))) - goto no_merge; - - rq_qos_merge(q, req, bio); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bio->bi_iter.bi_size; - req->nr_phys_segments = segments + 1; - - blk_account_io_merge_bio(req); - return true; -no_merge: - req_set_nomerge(q, req); - return false; -} - -/** - * blk_attempt_plug_merge - try to merge with %current's plugged list - * @q: request_queue new bio is being queued at - * @bio: new bio being queued - * @nr_segs: number of segments in @bio - * @same_queue_rq: pointer to &struct request that gets filled in when - * another request associated with @q is found on the plug list - * (optional, may be %NULL) - * - * Determine whether @bio being queued on @q can be merged with a request - * on %current's plugged list. Returns %true if merge was successful, - * otherwise %false. - * - * Plugging coalesces IOs from the same issuer for the same purpose without - * going through @q->queue_lock. 
As such it's more of an issuing mechanism - * than scheduling, and the request, while may have elvpriv data, is not - * added on the elevator at this point. In addition, we don't have - * reliable access to the elevator outside queue lock. Only check basic - * merging parameters without querying the elevator. - * - * Caller must ensure !blk_queue_nomerges(q) beforehand. - */ -bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq) -{ - struct blk_plug *plug; - struct request *rq; - struct list_head *plug_list; - - plug = blk_mq_plug(q, bio); - if (!plug) - return false; - - plug_list = &plug->mq_list; - - list_for_each_entry_reverse(rq, plug_list, queuelist) { - bool merged = false; - - if (rq->q == q && same_queue_rq) { - /* - * Only blk-mq multiple hardware queues case checks the - * rq in the same queue, there should be only one such - * rq in a queue - **/ - *same_queue_rq = rq; - } - - if (rq->q != q || !blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - merged = bio_attempt_back_merge(rq, bio, nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - merged = bio_attempt_front_merge(rq, bio, nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - break; - } - - if (merged) - return true; - } - - return false; -} - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; diff --git a/block/blk-merge.c b/block/blk-merge.c index f685d633bcc9..3aa2de57b57f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -11,6 +11,7 @@ #include #include "blk.h" +#include "blk-rq-qos.h" static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) @@ -895,3 +896,159 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) return ELEVATOR_FRONT_MERGE; return ELEVATOR_NO_MERGE; } + +static 
void blk_account_io_merge_bio(struct request *req) +{ + if (!blk_do_io_stat(req)) + return; + + part_stat_lock(); + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_unlock(); +} + +bool bio_attempt_back_merge(struct request *req, struct bio *bio, + unsigned int nr_segs) +{ + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + + if (!ll_back_merge_fn(req, bio, nr_segs)) + return false; + + trace_block_bio_backmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_free_ctx(bio); + + blk_account_io_merge_bio(req); + return true; +} + +bool bio_attempt_front_merge(struct request *req, struct bio *bio, + unsigned int nr_segs) +{ + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + + if (!ll_front_merge_fn(req, bio, nr_segs)) + return false; + + trace_block_bio_frontmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + bio->bi_next = req->bio; + req->bio = bio; + + req->__sector = bio->bi_iter.bi_sector; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_do_front_merge(req, bio); + + blk_account_io_merge_bio(req); + return true; +} + +bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, + struct bio *bio) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + rq_qos_merge(q, req, bio); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + req->nr_phys_segments = segments + 1; + + blk_account_io_merge_bio(req); + return true; +no_merge: + req_set_nomerge(q, req); + return false; +} + +/** + * 
blk_attempt_plug_merge - try to merge with %current's plugged list + * @q: request_queue new bio is being queued at + * @bio: new bio being queued + * @nr_segs: number of segments in @bio + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) + * + * Determine whether @bio being queued on @q can be merged with a request + * on %current's plugged list. Returns %true if merge was successful, + * otherwise %false. + * + * Plugging coalesces IOs from the same issuer for the same purpose without + * going through @q->queue_lock. As such it's more of an issuing mechanism + * than scheduling, and the request, while may have elvpriv data, is not + * added on the elevator at this point. In addition, we don't have + * reliable access to the elevator outside queue lock. Only check basic + * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. 
+ */ +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, struct request **same_queue_rq) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + + plug = blk_mq_plug(q, bio); + if (!plug) + return false; + + plug_list = &plug->mq_list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { + bool merged = false; + + if (rq->q == q && same_queue_rq) { + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + *same_queue_rq = rq; + } + + if (rq->q != q || !blk_rq_merge_ok(rq, bio)) + continue; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + merged = bio_attempt_back_merge(rq, bio, nr_segs); + break; + case ELEVATOR_FRONT_MERGE: + merged = bio_attempt_front_merge(rq, bio, nr_segs); + break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + break; + } + + if (merged) + return true; + } + + return false; +} From bdc6a287bc98e8f32bf52c9cb2d1bdf75975f5a0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 28 Aug 2020 10:52:55 +0800 Subject: [PATCH 13/23] block: Move blk_mq_bio_list_merge() into blk-merge.c Move the blk_mq_bio_list_merge() into blk-merge.c and rename it as a generic name. 
Reviewed-by: Christoph Hellwig Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe --- block/blk-merge.c | 44 ++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sched.c | 46 +----------------------------------------- block/blk.h | 2 ++ block/kyber-iosched.c | 2 +- include/linux/blk-mq.h | 2 -- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 3aa2de57b57f..b09e9fc44236 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -1052,3 +1052,47 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, return false; } + +/* + * Iterate list of requests and see if we can merge this bio with any + * of them. + */ +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs) +{ + struct request *rq; + int checked = 8; + + list_for_each_entry_reverse(rq, list, queuelist) { + bool merged = false; + + if (!checked--) + break; + + if (!blk_rq_merge_ok(rq, bio)) + continue; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_back_merge(rq, bio, + nr_segs); + break; + case ELEVATOR_FRONT_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_front_merge(rq, bio, + nr_segs); + break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + continue; + } + + return merged; + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_bio_list_merge); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d2790e5b06d1..82acff96c093 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -391,50 +391,6 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); -/* - * Iterate list of requests and see if we can merge this bio with any - * of them. 
- */ -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio, unsigned int nr_segs) -{ - struct request *rq; - int checked = 8; - - list_for_each_entry_reverse(rq, list, queuelist) { - bool merged = false; - - if (!checked--) - break; - - if (!blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - continue; - } - - return merged; - } - - return false; -} -EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); - /* * Reverse check our software queue for entries that we could potentially * merge with. Currently includes a hand-wavy stop count of 8, to not spend @@ -449,7 +405,7 @@ static bool blk_mq_attempt_merge(struct request_queue *q, lockdep_assert_held(&ctx->lock); - if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { ctx->rq_merged++; return true; } diff --git a/block/blk.h b/block/blk.h index 49e2928a1632..d6152d20d4e2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -177,6 +177,8 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **same_queue_rq); +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs); void blk_account_io_start(struct request *req); void blk_account_io_done(struct request *req, u64 now); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index a38c5ab103d1..6d4ba0e9688e 100644 --- a/block/kyber-iosched.c +++ 
b/block/kyber-iosched.c @@ -573,7 +573,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, bool merged; spin_lock(&kcq->lock); - merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); + merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); spin_unlock(&kcq->lock); return merged; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9d2d5ad367a4..21a02e0577dd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -489,8 +489,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); From 7d7ca7c5269becab86c9f595309c8e90ce268967 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 28 Aug 2020 10:52:56 +0800 Subject: [PATCH 14/23] block: Add a new helper to attempt to merge a bio There is a lot of duplicated code when trying to merge a bio from the plug list and the sw queue. Introduce a new helper to attempt to merge a bio, which simplifies blk_bio_list_merge() and blk_attempt_plug_merge().
Reviewed-by: Christoph Hellwig Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe --- block/blk-merge.c | 104 ++++++++++++++++++++++--------------------- block/blk-mq-sched.c | 6 +-- block/blk.h | 21 ++++++--- 3 files changed, 71 insertions(+), 60 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b09e9fc44236..80c974484a3f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -907,13 +907,14 @@ static void blk_account_io_merge_bio(struct request *req) part_stat_unlock(); } -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs) { const int ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_back_merge_fn(req, bio, nr_segs)) - return false; + return BIO_MERGE_FAILED; trace_block_bio_backmerge(req->q, req, bio); rq_qos_merge(req->q, req, bio); @@ -928,16 +929,17 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio, bio_crypt_free_ctx(bio); blk_account_io_merge_bio(req); - return true; + return BIO_MERGE_OK; } -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) +enum bio_merge_status bio_attempt_front_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs) { const int ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_front_merge_fn(req, bio, nr_segs)) - return false; + return BIO_MERGE_FAILED; trace_block_bio_frontmerge(req->q, req, bio); rq_qos_merge(req->q, req, bio); @@ -954,11 +956,12 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio, bio_crypt_do_front_merge(req, bio); blk_account_io_merge_bio(req); - return true; + return BIO_MERGE_OK; } -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio *bio) +enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, + struct request *req, + struct bio *bio) { unsigned short segments = blk_rq_nr_discard_segments(req); @@ -976,10 
+979,39 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, req->nr_phys_segments = segments + 1; blk_account_io_merge_bio(req); - return true; + return BIO_MERGE_OK; no_merge: req_set_nomerge(q, req); - return false; + return BIO_MERGE_FAILED; +} + +static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, + struct request *rq, + struct bio *bio, + unsigned int nr_segs, + bool sched_allow_merge) +{ + if (!blk_rq_merge_ok(rq, bio)) + return BIO_MERGE_NONE; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!sched_allow_merge || + (sched_allow_merge && blk_mq_sched_allow_merge(q, rq, bio))) + return bio_attempt_back_merge(rq, bio, nr_segs); + break; + case ELEVATOR_FRONT_MERGE: + if (!sched_allow_merge || + (sched_allow_merge && blk_mq_sched_allow_merge(q, rq, bio))) + return bio_attempt_front_merge(rq, bio, nr_segs); + break; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio); + default: + return BIO_MERGE_NONE; + } + + return BIO_MERGE_FAILED; } /** @@ -1018,8 +1050,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, plug_list = &plug->mq_list; list_for_each_entry_reverse(rq, plug_list, queuelist) { - bool merged = false; - if (rq->q == q && same_queue_rq) { /* * Only blk-mq multiple hardware queues case checks the @@ -1029,24 +1059,11 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, *same_queue_rq = rq; } - if (rq->q != q || !blk_rq_merge_ok(rq, bio)) + if (rq->q != q) continue; - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - merged = bio_attempt_back_merge(rq, bio, nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - merged = bio_attempt_front_merge(rq, bio, nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - break; - } - - if (merged) + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) return true; } @@ -1064,33 
+1081,18 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, int checked = 8; list_for_each_entry_reverse(rq, list, queuelist) { - bool merged = false; - if (!checked--) break; - if (!blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: + switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) { + case BIO_MERGE_NONE: continue; + case BIO_MERGE_OK: + return true; + case BIO_MERGE_FAILED: + return false; } - return merged; } return false; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 82acff96c093..94db0c9d59b8 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -368,7 +368,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, case ELEVATOR_BACK_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; - if (!bio_attempt_back_merge(rq, bio, nr_segs)) + if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK) return false; *merged_request = attempt_back_merge(q, rq); if (!*merged_request) @@ -377,14 +377,14 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, case ELEVATOR_FRONT_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; - if (!bio_attempt_front_merge(rq, bio, nr_segs)) + if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK) return false; *merged_request = attempt_front_merge(q, rq); if (!*merged_request) elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); return true; case ELEVATOR_DISCARD_MERGE: - return bio_attempt_discard_merge(q, rq, bio); + return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK; default: return false; } diff --git 
a/block/blk.h b/block/blk.h index d6152d20d4e2..a180443ac13f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,12 @@ struct blk_flush_queue { spinlock_t mq_flush_lock; }; +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -169,12 +175,15 @@ static inline void blk_integrity_del(struct gendisk *disk) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs); -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs); -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio *bio); +enum bio_merge_status bio_attempt_front_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs); +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs); +enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, + struct request *req, + struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **same_queue_rq); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, From cdfcef9ee87745d9511ad8825e1d2b8b861884da Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 28 Aug 2020 10:52:57 +0800 Subject: [PATCH 15/23] block: Remove blk_mq_attempt_merge() function The small blk_mq_attempt_merge() function is only called by __blk_mq_sched_bio_merge(), just open code it. 
Reviewed-by: Christoph Hellwig Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 94db0c9d59b8..205d9716f7a5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -391,28 +391,6 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); -/* - * Reverse check our software queue for entries that we could potentially - * merge with. Currently includes a hand-wavy stop count of 8, to not spend - * too much time checking for merges. - */ -static bool blk_mq_attempt_merge(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct bio *bio, - unsigned int nr_segs) -{ - enum hctx_type type = hctx->type; - - lockdep_assert_held(&ctx->lock); - - if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { - ctx->rq_merged++; - return true; - } - - return false; -} - bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { @@ -426,14 +404,24 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return e->type->ops.bio_merge(hctx, bio, nr_segs); type = hctx->type; - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !list_empty_careful(&ctx->rq_lists[type])) { - /* default per sw-queue merge */ - spin_lock(&ctx->lock); - ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs); - spin_unlock(&ctx->lock); + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || + list_empty_careful(&ctx->rq_lists[type])) + return false; + + /* default per sw-queue merge */ + spin_lock(&ctx->lock); + /* + * Reverse check our software queue for entries that we could + * potentially merge with. Currently includes a hand-wavy stop + * count of 8, to not spend too much time checking for merges. 
+ */ + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { + ctx->rq_merged++; + ret = true; } + spin_unlock(&ctx->lock); + return ret; } From e44a6a2359808e0b97ff894569748ad2593b06f9 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Thu, 27 Aug 2020 14:34:17 +0800 Subject: [PATCH 16/23] blk-mq: use BLK_MQ_NO_TAG for no tag Replace various magic -1 constants for tags with BLK_MQ_NO_TAG. Signed-off-by: Xianting Tian Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/blk-mq-sched.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 11661f5b64d6..062efdedc994 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - rq->tag = -1; - rq->internal_tag = -1; + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; refcount_set(&rq->ref, 1); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 205d9716f7a5..501a85ceaccb 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -475,7 +475,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, goto run; } - WARN_ON(e && (rq->tag != -1)); + WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { /* From 7b8917f5e29c377be1db5680249fe30e038cb3eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:33 +0200 Subject: [PATCH 17/23] block: remove the alignment_offset field from struct hd_struct The alignment offset is only used in slow path callers, so just calculate it on the fly. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/core.c | 7 ++++--- include/linux/blkdev.h | 5 ++--- include/linux/genhd.h | 1 - 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index b1c0b50ca92d..94c2fb39e602 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -199,7 +199,10 @@ static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); + + return sprintf(buf, "%u\n", + queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, + p->start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, @@ -405,8 +408,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; - p->alignment_offset = - queue_limit_alignment_offset(&disk->queue->limits, start); p->discard_alignment = queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0a1730b30ad2..ba1f5f5e11c6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1456,10 +1456,9 @@ static inline int bdev_alignment_offset(struct block_device *bdev) if (q->limits.misaligned) return -1; - if (bdev != bdev->bd_contains) - return bdev->bd_part->alignment_offset; - + return queue_limit_alignment_offset(&q->limits, + bdev->bd_part->start_sect); return q->limits.alignment_offset; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 39025dc0397c..bfa411c80dbb 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -65,7 +65,6 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; - sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; From 
7cf34d97ab45203b975396393ded9d3867dfa8bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:34 +0200 Subject: [PATCH 18/23] block: remove the discard_alignment field from struct hd_struct The discard alignment is only used in slow path callers, so just calculate it on the fly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/core.c | 7 ++++--- include/linux/blkdev.h | 4 ++-- include/linux/genhd.h | 1 - 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 94c2fb39e602..e596a85bba1e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -209,7 +209,10 @@ static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); + + return sprintf(buf, "%u\n", + queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, + p->start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -408,8 +411,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba1f5f5e11c6..d0d61bc81615 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1498,8 +1498,8 @@ static inline int bdev_discard_alignment(struct block_device *bdev) struct request_queue *q = bdev_get_queue(bdev); if (bdev != bdev->bd_contains) - return bdev->bd_part->discard_alignment; - + return queue_limit_discard_alignment(&q->limits, + bdev->bd_part->start_sect); return q->limits.discard_alignment; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index bfa411c80dbb..9ea2ca31c278 100644 --- a/include/linux/genhd.h +++
b/include/linux/genhd.h @@ -65,7 +65,6 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; - unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; int policy, partno; From 46d40cfad13ccbd0739019d754d46d8f93e1d5aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:35 +0200 Subject: [PATCH 19/23] block: remove an outdated comment on the bd_dev field kdev_t is long gone, so we don't need a comment saying that a field isn't one. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 63a39e47fc60..59d9150165c4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,7 +20,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { - dev_t bd_dev; /* not a kdev_t - it's a search key */ + dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; From e5c7fb400227df5c7822a3c59b193d23e849d0ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:36 +0200 Subject: [PATCH 20/23] block: move the devcgroup_inode_permission call to blkdev_get devcgroup_inode_permission is never called for the recursive case, so move it out into blkdev_get.
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 08158bb2e76c..990e97bcbeaf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1449,22 +1449,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, struct gendisk *disk; int ret; int partno; - int perm = 0; bool first_open = false, unblock_events = true, need_restart; - if (mode & FMODE_READ) - perm |= MAY_READ; - if (mode & FMODE_WRITE) - perm |= MAY_WRITE; - /* - * hooks: /n/, see "layering violations". - */ - if (!for_part) { - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) - return ret; - } - restart: need_restart = false; ret = -ENXIO; @@ -1637,12 +1623,24 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, */ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - int res; + int ret, perm = 0; - res =__blkdev_get(bdev, mode, holder, 0); - if (res) - bdput(bdev); - return res; + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret) + goto bdput; + + ret =__blkdev_get(bdev, mode, holder, 0); + if (ret) + goto bdput; + return 0; + +bdput: + bdput(bdev); + return ret; } EXPORT_SYMBOL(blkdev_get); From f93af2a494e1d28377065a2320d11da98110a970 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:37 +0200 Subject: [PATCH 21/23] block: cleanup __alloc_disk_node Use early returns and goto-based unwinding to simplify the flow a bit. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 73 +++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 99c64641c314..055ce9cf1835 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1729,45 +1729,48 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); - if (disk) { - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) { - kfree(disk); - return NULL; - } - init_rwsem(&disk->lookup_sem); - disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - kfree(disk); - return NULL; - } - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); + if (!disk) + return NULL; - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. 
- */ - hd_sects_seq_init(&disk->part0); - if (hd_ref_init(&disk->part0)) { - hd_free_part(&disk->part0); - kfree(disk); - return NULL; - } + disk->part0.dkstats = alloc_percpu(struct disk_stats); + if (!disk->part0.dkstats) + goto out_free_disk; - disk->minors = minors; - rand_initialize_disk(disk); - disk_to_dev(disk)->class = &block_class; - disk_to_dev(disk)->type = &disk_type; - device_initialize(disk_to_dev(disk)); + init_rwsem(&disk->lookup_sem); + disk->node_id = node_id; + if (disk_expand_part_tbl(disk, 0)) { + free_percpu(disk->part0.dkstats); + goto out_free_disk; } + + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + rcu_assign_pointer(ptbl->part[0], &disk->part0); + + /* + * set_capacity() and get_capacity() currently don't use + * seqcounter to read/update the part0->nr_sects. Still init + * the counter as we can read the sectors in IO submission + * patch using seqence counters. + * + * TODO: Ideally set_capacity() and get_capacity() should be + * converted to make use of bd_mutex and sequence counters. + */ + hd_sects_seq_init(&disk->part0); + if (hd_ref_init(&disk->part0)) + goto out_free_part0; + + disk->minors = minors; + rand_initialize_disk(disk); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); return disk; + +out_free_part0: + hd_free_part(&disk->part0); +out_free_disk: + kfree(disk); + return NULL; } EXPORT_SYMBOL(__alloc_disk_node); From 8328eb28369a7dbfab6ff26366dbe8094425acc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:38 +0200 Subject: [PATCH 22/23] block: remove the disk argument to delete_partition We can trivially derive the gendisk from the hd_struct. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 2 +- block/partitions/core.c | 9 +++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/block/blk.h b/block/blk.h index a180443ac13f..c08762e10b04 100644 --- a/block/blk.h +++ b/block/blk.h @@ -361,7 +361,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct gendisk *disk, struct hd_struct *part); +void delete_partition(struct hd_struct *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/genhd.c b/block/genhd.c index 055ce9cf1835..2055b5bf637a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -913,7 +913,7 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(disk, part->partno); - delete_partition(disk, part); + delete_partition(part); } disk_part_iter_exit(&piter); diff --git a/block/partitions/core.c b/block/partitions/core.c index e596a85bba1e..dd6811422a87 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -324,8 +324,9 @@ int hd_ref_init(struct hd_struct *part) * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -void delete_partition(struct gendisk *disk, struct hd_struct *part) +void delete_partition(struct hd_struct *part) { + struct gendisk *disk = part_to_disk(part); struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); @@ -333,7 +334,7 @@ void delete_partition(struct gendisk *disk, struct hd_struct *part) * ->part_tbl is referenced in this part's release handler, so * we have to hold the disk device */ - get_device(disk_to_dev(part_to_disk(part))); + get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); @@ -556,7 +557,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) sync_blockdev(bdevp); invalidate_bdev(bdevp); - delete_partition(bdev->bd_disk, part); + delete_partition(part); ret = 0; out_unlock: mutex_unlock(&bdev->bd_mutex); @@ -636,7 +637,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(bdev->bd_disk, part); + delete_partition(part); disk_part_iter_exit(&piter); return 0; From 1f06959bd2c96342dbac8e29994dd4f69deb956e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:39 +0200 Subject: [PATCH 23/23] block: remove the unused q argument to part_in_flight and part_in_flight_rw Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 2055b5bf637a..5fc6d82e6c68 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -110,8 +110,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct request_queue *q, - struct hd_struct *part) +static unsigned int part_in_flight(struct hd_struct *part) { unsigned int inflight = 0; int cpu; @@ -126,8 +125,7 @@ static unsigned int 
part_in_flight(struct request_queue *q, return inflight; } -static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) { int cpu; @@ -1301,7 +1299,7 @@ ssize_t part_stat_show(struct device *dev, if (queue_is_mq(q)) inflight = blk_mq_in_flight(q, p); else - inflight = part_in_flight(q, p); + inflight = part_in_flight(p); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1343,7 +1341,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, if (queue_is_mq(q)) blk_mq_in_flight_rw(q, p, inflight); else - part_in_flight_rw(q, p, inflight); + part_in_flight_rw(p, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1623,7 +1621,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else - inflight = part_in_flight(gp->queue, hd); + inflight = part_in_flight(hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u "