From 9f346f7d4ea73692b82f5102ca8698e4040469ea Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 27 May 2025 16:14:07 +0800 Subject: [PATCH 1/5] md/raid1,raid10: don't handle IO error for REQ_RAHEAD and REQ_NOWAIT IO with REQ_RAHEAD or REQ_NOWAIT can fail early, even if the storage medium is fine, hence record badblocks or remove the disk from array does not make sense. This problem if found by lvm2 test lvcreate-large-raid, where dm-zero will fail read ahead IO directly. Fixes: e879a0d9cb08 ("md/raid1,raid10: don't ignore IO flags") Reported-and-tested-by: Mikulas Patocka Closes: https://lore.kernel.org/all/34fa755d-62c8-4588-8ee1-33cb1249bdf2@redhat.com/ Link: https://lore.kernel.org/linux-raid/20250527081407.3004055-1-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai --- drivers/md/raid1-10.c | 10 ++++++++++ drivers/md/raid1.c | 19 ++++++++++--------- drivers/md/raid10.c | 11 ++++++----- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index c7efd8aab675..b8b3a9069701 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -293,3 +293,13 @@ static inline bool raid1_should_read_first(struct mddev *mddev, return false; } + +/* + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at anytime, before such IO is + * submitted to the underlying disks, hence don't record badblocks or retry + * in this case. + */ +static inline bool raid1_should_handle_error(struct bio *bio) +{ + return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT)); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 657d481525be..19c5a0ce5a40 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -373,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio) */ update_head_pos(r1_bio->read_disk, r1_bio); - if (uptodate) + if (uptodate) { set_bit(R1BIO_Uptodate, &r1_bio->state); - else if (test_bit(FailFast, &rdev->flags) && - test_bit(R1BIO_FailFast, &r1_bio->state)) + } else if (test_bit(FailFast, &rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) { /* This was a fail-fast read so we definitely * want to retry */ ; - else { + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; + } else { /* If all other devices have failed, we want to return * the error upwards rather than fail the last device. * Here we redefine "uptodate" to mean "Don't want to retry" @@ -451,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio) struct bio *to_put = NULL; int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; - bool discard_error; sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); /* * 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -511,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && - !discard_error) { + !ignore_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index dce06bf65016..b74780af4c22 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -399,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio) * wait for the 'master' bio. */ set_bit(R10BIO_Uptodate, &r10_bio->state); + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; } else { /* If all other devices that store this block have * failed, we want to return the error upwards rather @@ -456,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio) int slot, repl; struct md_rdev *rdev = NULL; struct bio *to_put = NULL; - bool discard_error; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); @@ -472,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -527,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, r10_bio->sectors) && - !discard_error) { + !ignore_error) { bio_put(bio); if (repl) r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; From 2afe17794cfed5f80295b1b9facd66e6f65e5002 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 24 May 2025 14:13:10 +0800 Subject: [PATCH 2/5] md/md-bitmap: fix dm-raid max_write_behind setting It's supposed to be COUNTER_MAX / 2, not COUNTER_MAX. Link: https://lore.kernel.org/linux-raid/20250524061320.370630-14-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- drivers/md/md-bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 37b08f26c62f..45dd3d9f01a8 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -789,7 +789,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. */ write_behind = bitmap->mddev->bitmap_info.max_write_behind; - if (write_behind > COUNTER_MAX) + if (write_behind > COUNTER_MAX / 2) write_behind = COUNTER_MAX / 2; sb->write_behind = cpu_to_le32(write_behind); bitmap->mddev->bitmap_info.max_write_behind = write_behind; From b886475804230cad8075e1ed8c7fb2333f7c41c1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 24 May 2025 14:13:11 +0800 Subject: [PATCH 3/5] md/dm-raid: remove max_write_behind setting limit The comments said 'vaule in kB', while the value actually means the number of write_behind IOs. And since md-bitmap will automatically adjust the value to max COUNTER_MAX / 2, there is no need to fail early. Also move some macros that is only used md-bitmap.c. Link: https://lore.kernel.org/linux-raid/20250524061320.370630-15-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Xiao Ni --- drivers/md/dm-raid.c | 6 +----- drivers/md/md-bitmap.c | 10 ++++++++++ drivers/md/md-bitmap.h | 9 --------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 127138c61be5..d296770478b2 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1356,11 +1356,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, return -EINVAL; } - /* - * In device-mapper, we specify things in sectors, but - * MD records this value in kB - */ - if (value < 0 || value / 2 > COUNTER_MAX) { + if (value < 0) { rs->ti->error = "Max write-behind limit out of range"; return -EINVAL; } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 45dd3d9f01a8..931623b4d4a6 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -105,9 +105,19 @@ * */ +typedef __u16 bitmap_counter_t; + #define PAGE_BITS (PAGE_SIZE << 3) #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) + #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 31c93019c76b..08a9dfb69673 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -9,15 +9,6 @@ #define BITMAP_MAGIC 0x6d746962 -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) - /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ From 38f520a37d541a52ce16437a379824fabcd768da Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 24 May 2025 14:13:00 +0800 Subject: [PATCH 4/5] md/md-bitmap: cleanup bitmap_ops->startwrite() bitmap_startwrite() always return 0, and the caller doesn't check return value as well, hence change the method to void. Also rename startwrite/endwrite to start_write/end_write, which is more in line with the usual naming convention. Link: https://lore.kernel.org/linux-raid/20250524061320.370630-4-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- drivers/md/md-bitmap.c | 17 ++++++++--------- drivers/md/md-bitmap.h | 6 +++--- drivers/md/md.c | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 931623b4d4a6..e415219d1f23 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1682,13 +1682,13 @@ __acquires(bitmap->lock) &(bitmap->bp[page].map[pageoff]); } -static int bitmap_startwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors) +static void bitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) - return 0; + return; while (sectors) { sector_t blocks; @@ -1698,7 +1698,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); if (!bmc) { spin_unlock_irq(&bitmap->counts.lock); - return 0; + return; } if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { @@ -1734,11 +1734,10 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, else sectors = 0; } - return 0; } -static void bitmap_endwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors) +static void bitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; @@ -3013,8 +3012,8 @@ static struct bitmap_operations bitmap_ops = { .end_behind_write = bitmap_end_behind_write, .wait_behind_writes = bitmap_wait_behind_writes, - .startwrite = bitmap_startwrite, - .endwrite = bitmap_endwrite, + .start_write = bitmap_start_write, + .end_write = bitmap_end_write, .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, .cond_end_sync = bitmap_cond_end_sync, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 08a9dfb69673..96f09d620f33 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -80,10 +80,10 @@ struct bitmap_operations { void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); - int (*startwrite)(struct mddev *mddev, sector_t offset, + void (*start_write)(struct mddev *mddev, sector_t offset, + unsigned long sectors); + void (*end_write)(struct mddev *mddev, sector_t offset, unsigned long sectors); - void (*endwrite)(struct mddev *mddev, sector_t offset, - unsigned long sectors); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0fde115e921f..d0d9144ced5b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8799,14 +8799,14 @@ static void md_bitmap_start(struct mddev *mddev, mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, &md_io_clone->sectors); - mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset, - md_io_clone->sectors); + mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, + md_io_clone->sectors); } static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) { - mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset, - md_io_clone->sectors); + mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, + md_io_clone->sectors); } static void md_end_clone_io(struct bio *bio) From 01bf468c4e086b2f52a0c9dfa677c6016fc915ae Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 24 May 2025 14:13:02 +0800 Subject: [PATCH 5/5] md/md-bitmap: remove parameter slot from bitmap_create() All callers pass in '-1' for 'slot', hence it can be removed. Link: https://lore.kernel.org/linux-raid/20250524061320.370630-6-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Xiao Ni Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- drivers/md/md-bitmap.c | 6 +++--- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index e415219d1f23..bd694910b01b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2214,9 +2214,9 @@ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) return ERR_PTR(err); } -static int bitmap_create(struct mddev *mddev, int slot) +static int bitmap_create(struct mddev *mddev) { - struct bitmap *bitmap = __bitmap_create(mddev, slot); + struct bitmap *bitmap = __bitmap_create(mddev, -1); if (IS_ERR(bitmap)) return PTR_ERR(bitmap); @@ -2679,7 +2679,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } mddev->bitmap_info.offset = offset; - rv = bitmap_create(mddev, -1); + rv = bitmap_create(mddev); if (rv) goto out; diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 96f09d620f33..59e9dd45cfde 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -63,7 +63,7 @@ struct md_bitmap_stats { struct bitmap_operations { bool (*enabled)(struct mddev *mddev); - int (*create)(struct mddev *mddev, int slot); + int (*create)(struct mddev *mddev); int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, bool init); diff --git a/drivers/md/md.c b/drivers/md/md.c index d0d9144ced5b..09042b060086 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6225,7 +6225,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev, -1); + err = mddev->bitmap_ops->create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -7285,7 +7285,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev, -1); + err = mddev->bitmap_ops->create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); @@ -7601,7 +7601,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev, -1); + rv = mddev->bitmap_ops->create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev);