From c50105933f0c75aacc4f95c9bf36f7fbd9a83884 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:39:59 +0100 Subject: [PATCH 01/11] iomap: allow the file system to submit the writeback bios Change ->prepare_ioend to ->submit_ioend and require file systems that implement it to submit the bio. This is needed for file systems that do their own work on the bios before submitting them to the block layer like btrfs or zoned xfs. To make this easier also pass the writeback context to the method. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 11 +++++------ fs/iomap/buffered-io.c | 10 +++++----- fs/xfs/xfs_aops.c | 13 +++++++++---- include/linux/iomap.h | 12 +++++++----- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 2c7f5df9d8b0..04fc7a49067d 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -283,7 +283,7 @@ The ``ops`` structure must be specified and is as follows: struct iomap_writeback_ops { int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned len); - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); void (*discard_folio)(struct folio *folio, loff_t pos); }; @@ -306,13 +306,12 @@ The fields are as follows: purpose. This function must be supplied by the filesystem. - - ``prepare_ioend``: Enables filesystems to transform the writeback - ioend or perform any other preparatory work before the writeback I/O - is submitted. + - ``submit_ioend``: Allows the file systems to hook into writeback bio + submission. This might include pre-write space accounting updates, or installing a custom ``->bi_end_io`` function for internal purposes, such as deferring the ioend completion to a workqueue to run metadata update - transactions from process context. + transactions from process context before submitting the bio. This function is optional. - ``discard_folio``: iomap calls this function after ``->map_blocks`` @@ -341,7 +340,7 @@ This can happen in interrupt or process context, depending on the storage device. Filesystems that need to update internal bookkeeping (e.g. unwritten -extent conversions) should provide a ``->prepare_ioend`` function to +extent conversions) should provide a ``->submit_ioend`` function to set ``struct iomap_end::bio::bi_end_io`` to its own function. This function should call ``iomap_finish_ioends`` after finishing its own work (e.g. unwritten extent conversion). diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..7952bf004bdb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1675,7 +1675,7 @@ static void iomap_writepage_end_bio(struct bio *bio) } /* - * Submit the final bio for an ioend. + * Submit an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. @@ -1694,14 +1694,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. */ - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (wpc->ops->submit_ioend) + error = wpc->ops->submit_ioend(wpc, error); + else if (!error) + submit_bio(&wpc->ioend->io_bio); if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); - } else { - submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 67877c36ed11..aa88895673d8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -395,10 +395,11 @@ xfs_map_blocks( } static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -420,7 +421,11 @@ xfs_prepare_ioend( if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -462,7 +467,7 @@ xfs_discard_folio( static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, .discard_folio = xfs_discard_folio, }; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..dc8df4f779d4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -362,12 +362,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. */ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where From 710273330663241d9ca5fbed51909e65807556ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:00 +0100 Subject: [PATCH 02/11] iomap: simplify io_flags and io_type in struct iomap_ioend The ioend fields for distinct types of I/O are a bit complicated. Consolidate them into a single io_flag field with it's own flags decoupled from the iomap flags. This also prepares for adding a new flag that is unrelated to both of the iomap namespaces. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-3-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 39 ++++++++++++++++++++++----------------- fs/xfs/xfs_aops.c | 12 ++++++------ include/linux/iomap.h | 20 ++++++++++++++++++-- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7952bf004bdb..d8d271107e60 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1605,13 +1605,10 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; - if (next->io_flags & IOMAP_F_BOUNDARY) + if (next->io_flags & IOMAP_IOEND_BOUNDARY) return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; @@ -1709,7 +1706,8 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { struct iomap_ioend *ioend; struct bio *bio; @@ -1724,8 +1722,7 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; + ioend->io_flags = ioend_flags; if (pos > wpc->iomap.offset) wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; ioend->io_inode = inode; @@ -1737,14 +1734,13 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, return ioend; } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) - return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; @@ -1779,14 +1775,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index aa88895673d8..8e60ceeb1520 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -114,7 +114,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); @@ -125,9 +125,9 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_flags & IOMAP_F_SHARED) + if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); if (!error && xfs_ioend_is_append(ioend)) @@ -410,7 +410,7 @@ xfs_submit_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,8 +418,8 @@ xfs_submit_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; if (status) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index dc8df4f779d4..9583f6456165 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -327,13 +327,29 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset, sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); +/* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + /* * Structure for writeback I/O completions. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ From 034c29fb3e7c119c42e650986e280f025a1bec7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:01 +0100 Subject: [PATCH 03/11] iomap: add a IOMAP_F_ANON_WRITE flag Add a IOMAP_F_ANON_WRITE flag that indicates that the write I/O does not have a target block assigned to it yet at iomap time and the file system will do that in the bio submission handler, splitting the I/O as needed. This is used to implement Zone Append based I/O for zoned XFS, where splitting writes to the hardware limits and assigning a zone to them happens just before sending the I/O off to the block layer, but could also be useful for other things like compressed I/O. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-4-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/design.rst | 4 ++++ fs/iomap/buffered-io.c | 13 +++++++++---- fs/iomap/direct-io.c | 6 ++++-- include/linux/iomap.h | 7 +++++++ 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e..28ab3758c474 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -246,6 +246,10 @@ The fields are as follows: * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can be set by the filesystem for its own purposes. + * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target + block assigned to it yet and the file system will do that in the bio + submission handler, splitting the I/O as needed. + These flags can be set by iomap itself during file operations. The filesystem should supply an ``->iomap_end`` function if it needs to observe these flags: diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d8d271107e60..ba795d72e546 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1691,10 +1691,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. */ - if (wpc->ops->submit_ioend) + if (wpc->ops->submit_ioend) { error = wpc->ops->submit_ioend(wpc, error); - else if (!error) - submit_bio(&wpc->ioend->io_bio); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); @@ -1744,7 +1748,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e..641649a04614 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -81,10 +81,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9583f6456165..eb0764945b42 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,10 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +72,7 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) /* * Flags set by the core iomap code during operations: @@ -111,6 +116,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } From 5fcbd555d48390a8c819ba7fdf55fbfcabe05c80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:02 +0100 Subject: [PATCH 04/11] iomap: split bios to zone append limits in the submission handlers Provide helpers for file systems to split bios in the direct I/O and writeback I/O submission handlers. The split ioends are chained to the parent ioend so that only the parent ioend originally generated by the iomap layer will be processed after all the chained off children have completed. This is based on the block layer bio chaining that has supported a similar mechanism for a long time. This Follows btrfs' lead and don't try to build bios to hardware limits for zone append commands, but instead build them as normal unconstrained bios and split them to the hardware limits in the I/O submission handler. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-5-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/Makefile | 1 + fs/iomap/buffered-io.c | 49 ++++++++++++++---------- fs/iomap/ioend.c | 86 ++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 15 +++++++- 4 files changed, 130 insertions(+), 21 deletions(-) create mode 100644 fs/iomap/ioend.c diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c232..69e8ebb41302 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ba795d72e546..f67e13a9807a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -40,7 +40,8 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) @@ -1539,15 +1540,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * ioend after this. */ static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,6 +1567,24 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } +static u32 +iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + return iomap_finish_ioend_buffered(ioend); +} + /* * Ioend completion routine for merged bios. This can only be called from task * contexts as merged ioends can be of unbound length. Hence we have to break up @@ -1667,8 +1686,10 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* @@ -1713,7 +1734,6 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, loff_t pos, u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1721,21 +1741,10 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_flags = ioend_flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 000000000000..3ff38c665c31 --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. + */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eb0764945b42..90c27875e39d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -353,12 +353,19 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, /* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend can split a bio generated + * by iomap. In that case the parent ioend it was split from is recorded + * in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completetion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ @@ -408,6 +415,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -479,4 +490,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ From 63b66913d11c5f3572dfdee38e78d510d0f90aa8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:03 +0100 Subject: [PATCH 05/11] iomap: move common ioend code to ioend.c This code will be reused for direct I/O soon, so split it out of buffered-io.c. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-6-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 135 +---------------------------------------- fs/iomap/internal.h | 9 +++ fs/iomap/ioend.c | 127 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 133 deletions(-) create mode 100644 fs/iomap/internal.h diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f67e13a9807a..4abff64998fe 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -12,17 +12,15 @@ #include #include #include -#include #include #include #include #include +#include "internal.h" #include "trace.h" #include "../internal.h" -#define IOEND_BATCH_SIZE 4096 - /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. @@ -40,9 +38,6 @@ struct iomap_folio_state { unsigned long state[]; }; -struct bio_set iomap_ioend_bioset; -EXPORT_SYMBOL_GPL(iomap_ioend_bioset); - static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { @@ -1539,8 +1534,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static u32 -iomap_finish_ioend_buffered(struct iomap_ioend *ioend) +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; @@ -1567,123 +1561,6 @@ iomap_finish_ioend_buffered(struct iomap_ioend *ioend) return folio_count; } -static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) -{ - if (ioend->io_parent) { - struct bio *bio = &ioend->io_bio; - - ioend = ioend->io_parent; - bio_put(bio); - } - - if (error) - cmpxchg(&ioend->io_error, 0, error); - - if (!atomic_dec_and_test(&ioend->io_remaining)) - return 0; - return iomap_finish_ioend_buffered(ioend); -} - -/* - * Ioend completion routine for merged bios. This can only be called from task - * contexts as merged ioends can be of unbound length. Hence we have to break up - * the writeback completions into manageable chunks to avoid long scheduler - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get - * good batch processing throughput without creating adverse scheduler latency - * conditions. - */ -void -iomap_finish_ioends(struct iomap_ioend *ioend, int error) -{ - struct list_head tmp; - u32 completions; - - might_sleep(); - - list_replace_init(&ioend->io_list, &tmp); - completions = iomap_finish_ioend(ioend, error); - - while (!list_empty(&tmp)) { - if (completions > IOEND_BATCH_SIZE * 8) { - cond_resched(); - completions = 0; - } - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); - list_del_init(&ioend->io_list); - completions += iomap_finish_ioend(ioend, error); - } -} -EXPORT_SYMBOL_GPL(iomap_finish_ioends); - -/* - * We can merge two adjacent ioends if they have the same set of work to do. - */ -static bool -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) -{ - if (ioend->io_bio.bi_status != next->io_bio.bi_status) - return false; - if (next->io_flags & IOMAP_IOEND_BOUNDARY) - return false; - if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != - (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - /* - * Do not merge physically discontiguous ioends. The filesystem - * completion functions will have to iterate the physical - * discontiguities even if we merge the ioends at a logical level, so - * we don't gain anything by merging physical discontiguities here. - * - * We cannot use bio->bi_iter.bi_sector here as it is modified during - * submission so does not point to the start sector of the bio at - * completion. - */ - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) - return false; - return true; -} - -void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) -{ - struct iomap_ioend *next; - - INIT_LIST_HEAD(&ioend->io_list); - - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, - io_list))) { - if (!iomap_ioend_can_merge(ioend, next)) - break; - list_move_tail(&next->io_list, &ioend->io_list); - ioend->io_size += next->io_size; - } -} -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); - -static int -iomap_ioend_compare(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); - - if (ia->io_offset < ib->io_offset) - return -1; - if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - -void -iomap_sort_ioends(struct list_head *ioend_list) -{ - list_sort(NULL, ioend_list, iomap_ioend_compare); -} -EXPORT_SYMBOL_GPL(iomap_sort_ioends); - static void iomap_writepage_end_bio(struct bio *bio) { struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); @@ -2081,11 +1958,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, return iomap_submit_ioend(wpc, error); } EXPORT_SYMBOL_GPL(iomap_writepages); - -static int __init iomap_buffered_init(void) -{ - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_bio), - BIOSET_NEED_BVECS); -} -fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h new file mode 100644 index 000000000000..36d5c56e073e --- /dev/null +++ b/fs/iomap/internal.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IOMAP_INTERNAL_H +#define _IOMAP_INTERNAL_H 1 + +#define IOEND_BATCH_SIZE 4096 + +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); + +#endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 3ff38c665c31..97d43c50cdf7 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -3,6 +3,11 @@ * Copyright (c) 2024-2025 Christoph Hellwig. */ #include +#include +#include "internal.h" + +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, loff_t file_offset, u16 ioend_flags) @@ -22,6 +27,120 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode, } EXPORT_SYMBOL_GPL(iomap_init_ioend); +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + return iomap_finish_ioend_buffered(ioend); +} + +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ +void iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + u32 completions; + + might_sleep(); + + list_replace_init(&ioend->io_list, &tmp); + completions = iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + completions += iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, + struct iomap_ioend *next) +{ + if (ioend->io_bio.bi_status != next->io_bio.bi_status) + return false; + if (next->io_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != + next->io_sector) + return false; + return true; +} + +void iomap_ioend_try_merge(struct iomap_ioend *ioend, + struct list_head *more_ioends) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + /* * Split up to the first @max_len bytes from @ioend if the ioend covers more * than @max_len bytes. @@ -84,3 +203,11 @@ struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, return split_ioend; } EXPORT_SYMBOL_GPL(iomap_split_ioend); + +static int __init iomap_ioend_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_ioend_init); From ae2f33a519af3730cacd1c787ebe1f7475df5ba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:04 +0100 Subject: [PATCH 06/11] iomap: factor out a iomap_dio_done helper Split out the struct iomap-dio level final completion from iomap_dio_bio_end_io into a helper to clean up the code and make it reusable. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-7-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 78 ++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 641649a04614..277ece243770 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -165,43 +165,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -void iomap_dio_bio_end_io(struct bio *bio) +/* + * Called when dio->ref reaches zero from an I/O completion. + */ +static void iomap_dio_done(struct iomap_dio *dio) { - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - if (!atomic_dec_and_test(&dio->ref)) - goto release_bio; - - /* - * Synchronous dio, task itself will handle any completion work - * that needs after IO. All we need to do is wake the task. - */ if (dio->wait_for_completion) { + /* + * Synchronous I/O, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - goto release_bio; - } - - /* - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline - */ - if (dio->flags & IOMAP_DIO_INLINE_COMP) { + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); - goto release_bio; - } - - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule - * our completion that way to avoid an async punt to a workqueue. - */ - if (dio->flags & IOMAP_DIO_CALLER_COMP) { + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then + * schedule our completion that way to avoid an async punt to a + * workqueue. + */ /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; @@ -219,19 +207,31 @@ void iomap_dio_bio_end_io(struct bio *bio) * issuer. */ iocb->ki_complete(iocb, 0); - goto release_bio; - } + } else { + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Async DIO completion that requires filesystem level + * completion work gets punted to a work queue to complete as + * the operation may require more IO to be issued to finalise + * filesystem metadata changes or guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + } +} + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) + iomap_dio_done(dio); - /* - * Async DIO completion that requires filesystem level completion work - * gets punted to a work queue to complete as the operation may require - * more IO to be issued to finalise filesystem metadata changes or - * guarantee data integrity. - */ - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, - &dio->aio.work); -release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { From e523f2d4c974a819730830ce1c38834ee0cd7318 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:05 +0100 Subject: [PATCH 07/11] iomap: optionally use ioends for direct I/O struct iomap_ioend currently tracks outstanding buffered writes and has some really nice code in core iomap and XFS to merge contiguous I/Os an defer them to userspace for completion in a very efficient way. For zoned writes we'll also need a per-bio user context completion to record the written blocks, and the infrastructure for that would look basically like the ioend handling for buffered I/O. So instead of reinventing the wheel, reuse the existing infrastructure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-8-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 48 +++++++++++++++++++++++++++++++++++++++++-- fs/iomap/internal.h | 1 + fs/iomap/ioend.c | 2 ++ include/linux/iomap.h | 10 +++++---- 4 files changed, 55 insertions(+), 6 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 277ece243770..138d246ec29d 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include #include @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -119,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -241,6 +244,47 @@ void iomap_dio_bio_end_io(struct bio *bio) } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index 36d5c56e073e..f6992a3bf66a 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -5,5 +5,6 @@ #define IOEND_BATCH_SIZE 4096 u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); #endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 97d43c50cdf7..44f254ecab55 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -41,6 +41,8 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) if (!atomic_dec_and_test(&ioend->io_remaining)) return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); return iomap_finish_ioend_buffered(ioend); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 90c27875e39d..5768b9f2a1cc 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -343,20 +343,22 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_UNWRITTEN (1U << 1) /* don't merge into previous ioend */ #define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) /* * Structure for writeback I/O completions. * - * File systems implementing ->submit_ioend can split a bio generated - * by iomap. In that case the parent ioend it was split from is recorded - * in ioend->io_parent. + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ From d06244c60aec1d5d1589efe6b611a5b91a49465c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:06 +0100 Subject: [PATCH 08/11] iomap: add a io_private field to struct iomap_ioend Add a private data field to struct iomap_ioend so that the file system can attach information to it. Zoned XFS will use this for a pointer to the open zone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-9-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/ioend.c | 1 + include/linux/iomap.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 44f254ecab55..18894ebba6db 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -23,6 +23,7 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode, ioend->io_offset = file_offset; ioend->io_size = bio->bi_iter.bi_size; ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; return ioend; } EXPORT_SYMBOL_GPL(iomap_init_ioend); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5768b9f2a1cc..b4be07e8ec94 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -370,6 +370,7 @@ struct iomap_ioend { struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; From 02b39c4655d52141e07e80e9b2772d96daf67ff6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:07 +0100 Subject: [PATCH 09/11] iomap: pass private data to iomap_page_mkwrite Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-10-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 4 +++- fs/xfs/xfs_file.c | 3 ++- fs/zonefs/file.c | 2 +- include/linux/iomap.h | 5 ++--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 4abff64998fe..8c24d8611edf 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1489,11 +1489,13 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, return length; } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7a7d89c345e..785f6bbf1406 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1511,7 +1511,8 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + NULL); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c..42e2c0065bb3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b4be07e8ec94..d528eb4d5cfe 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -316,9 +316,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, From c6d1b8d15450cf061648d4e36d622da9d755654a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:08 +0100 Subject: [PATCH 10/11] iomap: pass private data to iomap_zero_range Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-11-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/gfs2/bmap.c | 3 ++- fs/iomap/buffered-io.c | 6 ++++-- fs/xfs/xfs_iomap.c | 2 +- include/linux/iomap.h | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..366516b98b3f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8c24d8611edf..382647fda1d1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1391,13 +1391,14 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1465,7 +1466,8 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + NULL); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c..3410c55f544a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1497,7 +1497,7 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } int diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d528eb4d5cfe..eddf524ac749 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -313,7 +313,7 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, From ddd402bbbf669c4ada106fd2e4c799e2b5745e3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:09 +0100 Subject: [PATCH 11/11] iomap: pass private data to iomap_truncate_page Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-12-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 4 ++-- fs/xfs/xfs_iomap.c | 2 +- include/linux/iomap.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 382647fda1d1..3458f97d1b1e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1467,7 +1467,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, if (!off) return 0; return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, - NULL); + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3410c55f544a..5dd0922fe2d1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1512,5 +1512,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eddf524ac749..022d7f338c68 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -315,7 +315,7 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); + const struct iomap_ops *ops, void *private); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,