From 001397f5ef4908ea46a63059439e8c3bf3552d9f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 31 Oct 2025 14:10:26 +0100 Subject: [PATCH 1/2] iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag Btrfs requires all of its bios to be fs block aligned, normally it's totally fine but with the incoming block size larger than page size (bs > ps) support, the requirement is no longer met for direct IOs. Because iomap_dio_bio_iter() calls bio_iov_iter_get_pages(), only requiring alignment to be bdev_logical_block_size(). In the real world that value is either 512 or 4K, on 4K page sized systems it means bio_iov_iter_get_pages() can break the bio at any page boundary, breaking btrfs' requirement for bs > ps cases. To address this problem, introduce a new public iomap dio flag, IOMAP_DIO_FSBLOCK_ALIGNED. When calling __iomap_dio_rw() with that new flag, iomap_dio::flags will inherit that new flag, and iomap_dio_bio_iter() will take fs block size into the calculation of the alignment, and pass the alignment to bio_iov_iter_get_pages(), respecting the fs block size requirement. The initial user of this flag will be btrfs, which needs to calculate the checksum for direct read and thus requires the biovec to be fs block aligned for the incoming bs > ps support. Signed-off-by: Qu Wenruo Reviewed-by: Pankaj Raghav [hch: also align pos/len, incorporate the trace flags from Darrick] Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251031131045.1613229-2-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 17 +++++++++++++++-- fs/iomap/trace.h | 7 ++++--- include/linux/iomap.h | 8 ++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e9e5f0703160..8b2f9fb89eb3 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -336,8 +336,18 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) int nr_pages, ret = 0; u64 copied = 0; size_t orig_count; + unsigned int alignment; - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) + /* + * File systems that write out of place and always allocate new blocks + * need each bio to be block aligned as that's the unit of allocation. + */ + if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED) + alignment = fs_block_size; + else + alignment = bdev_logical_block_size(iomap->bdev); + + if ((pos | length) & (alignment - 1)) return -EINVAL; if (dio->flags & IOMAP_DIO_WRITE) { @@ -434,7 +444,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_end_io = iomap_dio_bio_end_io; ret = bio_iov_iter_get_pages(bio, dio->submit.iter, - bdev_logical_block_size(iomap->bdev) - 1); + alignment - 1); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall @@ -639,6 +649,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED) + dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index a61c1dae4742..532787277b16 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_DIO_STRINGS \ - {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ - {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ - {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \ + {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 65d123114883..8b1ac08c7474 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -553,6 +553,14 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Ensure each bio is aligned to fs block size. + * + * For filesystems which need to calculate/verify the checksum of each fs + * block. Otherwise they may not be able to handle unaligned bios. + */ +#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); From 8caec6c9fef70c0d0ce1bf38ad343e18e5e1f6a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 Oct 2025 14:10:27 +0100 Subject: [PATCH 2/2] xfs: support sub-block aligned vectors in always COW mode Now that the block layer and iomap have grown support to indicate the bio sector size explicitly instead of assuming the device sector size, we can ask for logical block size alignment and thus support direct I/O writes where the overall size is logical block size aligned, but the boundaries between vectors might not be. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251031131045.1613229-3-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_file.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5b9864c8582e..6108612182e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -676,8 +676,17 @@ xfs_file_dio_write_aligned( struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; ssize_t ret; + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -695,7 +704,7 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); out_unlock: xfs_iunlock(ip, iolock); return ret; @@ -892,15 +901,7 @@ xfs_file_dio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* - * For always COW inodes we also must check the alignment of each - * individual iovec segment, as they could end up with different - * I/Os due to the way bio_iov_iter_get_pages works, and we'd - * then overwrite an already written block. - */ - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || - (xfs_is_always_cow_inode(ip) && - (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) return xfs_file_dio_write_unaligned(ip, iocb, from); if (xfs_is_zoned_inode(ip)) return xfs_file_dio_write_zoned(ip, iocb, from);