diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index deed823eab01..dc7eab191caa 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -238,6 +238,16 @@ All md devices contain: the number of devices in a raid4/5/6, or to support external metadata formats which mandate such clipping. + logical_block_size + Configure the array's logical block size in bytes. This attribute + is only supported for 1.x meta. Write the value before starting + array. The final array LBS uses the maximum between this + configuration and LBS of all combined devices. Note that + LBS cannot exceed PAGE_SIZE before RAID supports folio. + WARNING: Arrays created on new kernel cannot be assembled at old + kernel due to padding check, Set module parameter 'check_new_feature' + to false to bypass, but data loss may occur. + reshape_position This is either ``none`` or a sector number within the devices of the array where ``reshape`` is up to. If this is set, the three diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 25a6ddedea65..8d7b82c4a723 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -72,6 +72,7 @@ static int linear_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; + lim.logical_block_size = mddev->logical_block_size; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9676e2477df6..7b5c5967568f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1999,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); + mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; @@ -2208,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + sb->logical_block_size = cpu_to_le32(mddev->logical_block_size); if (test_bit(FailFast, &rdev->flags)) sb->devflags |= FailFast1; else @@ -5936,6 +5938,68 @@ static struct md_sysfs_entry md_serialize_policy = __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, serialize_policy_store); +static int mddev_set_logical_block_size(struct mddev *mddev, + unsigned int lbs) +{ + int err = 0; + struct queue_limits lim; + + if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) { + pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n", + mdname(mddev), lbs); + return -EINVAL; + } + + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.logical_block_size = lbs; + pr_info("%s: logical_block_size is changed, data may be lost\n", + mdname(mddev)); + err = queue_limits_commit_update(mddev->gendisk->queue, &lim); + if (err) + return err; + + mddev->logical_block_size = lbs; + /* New lbs will be written to superblock after array is running */ + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + return 0; +} + +static ssize_t +lbs_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%u\n", mddev->logical_block_size); +} + +static ssize_t +lbs_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int lbs; + int err = -EBUSY; + + /* Only 1.x meta supports configurable LBS */ + if (mddev->major_version == 0) + return -EINVAL; + + if (mddev->pers) + return -EBUSY; + + err = kstrtouint(buf, 10, &lbs); + if (err < 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + goto unlock; + + err = mddev_set_logical_block_size(mddev, lbs); + +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_logical_block_size = +__ATTR(logical_block_size, 0644, lbs_show, lbs_store); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -5958,6 +6022,7 @@ static struct attribute *md_default_attrs[] = { &md_consistency_policy.attr, &md_fail_last_dev.attr, &md_serialize_policy.attr, + &md_logical_block_size.attr, NULL, }; @@ -6088,6 +6153,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, return -EINVAL; } + /* + * Before RAID adding folio support, the logical_block_size + * should be smaller than the page size. + */ + if (lim->logical_block_size > PAGE_SIZE) { + pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n", + mdname(mddev)); + return -EINVAL; + } + mddev->logical_block_size = lim->logical_block_size; + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -6699,6 +6775,7 @@ static void md_clean(struct mddev *mddev) mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; + mddev->logical_block_size = 0; mddev->max_disks = 0; mddev->events = 0; mddev->can_decrease_events = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index fd6e001c1d38..6985f2829bbd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -433,6 +433,7 @@ struct mddev { sector_t array_sectors; /* exported array size */ int external_size; /* size managed * externally */ + unsigned int logical_block_size; __u64 events; /* If the last 'event' was simply a clean->dirty transition, and * we didn't write it to the spares, then it is safe and simple diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index fbf763401521..47aee1b1d4d1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -380,6 +380,7 @@ static int raid0_set_limits(struct mddev *mddev) lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; lim.chunk_sectors = mddev->chunk_sectors; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 592a40233004..57d50465eed1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3213,6 +3213,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.max_hw_wzeroes_unmap_sectors = 0; + lim.logical_block_size = mddev->logical_block_size; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 14dcd5142eb4..84be4cc7e873 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4000,6 +4000,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.max_hw_wzeroes_unmap_sectors = 0; + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 24b32a0c95b4..cdbc7eba5c54 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7745,6 +7745,7 @@ static int raid5_set_limits(struct mddev *mddev) stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); md_init_stacking_limits(&lim); + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index ac74133a4768..310068bb2a1d 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -291,7 +291,8 @@ struct mdp_superblock_1 { __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ __le32 sb_csum; /* checksum up to devs[max_dev] */ __le32 max_dev; /* size of devs[] array to consider */ - __u8 pad3[64-32]; /* set to 0 when writing */ + __le32 logical_block_size; /* same as q->limits->logical_block_size */ + __u8 pad3[64-36]; /* set to 0 when writing */ /* device state information. Indexed by dev_number. * 2 bytes per device