From 563d0d4c2c1dc1f3f84104c78b388d0490c0086f Mon Sep 17 00:00:00 2001
From: DaeMyung Kang
Date: Fri, 1 May 2026 02:20:53 +0900
Subject: [PATCH] ntfs: wait for sync mft writes to complete

ntfs_sync_mft_mirror() and write_mft_record_nolock() with @sync set are
both documented as synchronous, but neither actually waits for the bio
they submit nor inspects bi_status. write_inode() can return success
while dirty mft record bytes are still in flight, and bio errors are
silently dropped: the volume is not marked with errors and the inode is
not redirtied. This breaks fsync()/sync metadata durability.

Switch ntfs_sync_mft_mirror() and the @sync path of
write_mft_record_nolock() to submit_bio_wait() and propagate the
returned error to the caller. Capture ntfs_sync_mft_mirror()'s return
value at its call sites in write_mft_record_nolock() so a mirror write
failure surfaces too.

The @sync parameter only controls the main MFT bio. The !@sync main
submission is therefore unchanged and still uses ntfs_bio_end_io() to
drop the folio reference taken before submission. The mirror call has
always been documented as performing synchronous I/O regardless of
@sync, so making it actually block restores the originally intended
contract for both @sync and !@sync callers.

Note this only fixes the synchronous mirror/main paths reachable from
write_mft_record_nolock(). The main MFT write submitted from
ntfs_write_mft_block() (the .writepages path) still does not wait for
completion or check bi_status; that requires a larger restructuring and
is left to a follow-up patch.

Fixes: 115380f9a2f9 ("ntfs: update mft operations") Signed-off-by: DaeMyung Kang Reviewed-by: Hyunchul Lee Signed-off-by: Namjae Jeon --- fs/ntfs/mft.c | 63 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index f5186a19dffc..68f6fc8b7b62 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -449,7 +449,7 @@ static void ntfs_bio_end_io(struct bio *bio) int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no, struct mft_record *m) { - u8 *kmirr = NULL; + u8 *kmirr; struct folio *folio; unsigned int folio_ofs, lcn_folio_off = 0; int err = 0; @@ -479,6 +479,7 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no, kmirr = kmap_local_folio(folio, 0) + folio_ofs; /* Copy the mst protected mft record to the mirror. */ memcpy(kmirr, m, vol->mft_record_size); + kunmap_local(kmirr); if (vol->cluster_size_bits > PAGE_SHIFT) { lcn_folio_off = folio->index << PAGE_SHIFT; @@ -490,20 +491,22 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no, NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, vol->mftmirr_lcn) + lcn_folio_off + folio_ofs); - if (!bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) { + if (bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) + err = submit_bio_wait(bio); + else err = -EIO; - bio_put(bio); - goto unlock_folio; - } + bio_put(bio); - bio->bi_end_io = ntfs_bio_end_io; - submit_bio(bio); - /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* + * The in-memory mirror is now valid because we just memcpy()'d the + * mst-protected mft record into it. Mark the folio uptodate even on + * write error so a subsequent read_mapping_folio() does not refetch + * the stale on-disk mirror and overwrite this copy. The error is + * propagated to the caller via @err. 
+ */ folio_mark_uptodate(folio); -unlock_folio: folio_unlock(folio); - kunmap_local(kmirr); folio_put(folio); if (likely(!err)) { ntfs_debug("Done."); @@ -588,20 +591,36 @@ int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int syn } /* Synchronize the mft mirror now if not @sync. */ - if (!sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); + if (!sync && ni->mft_no < vol->mftmirr_size) { + int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no, + fixup_m); + if (unlikely(sub_err) && !err) + err = sub_err; + } - folio_get(folio); - bio->bi_private = folio; - bio->bi_end_io = ntfs_bio_end_io; - submit_bio(bio); + if (sync) { + int sub_err = submit_bio_wait(bio); + + bio_put(bio); + if (unlikely(sub_err) && !err) + err = sub_err; + } else { + folio_get(folio); + bio->bi_private = folio; + bio->bi_end_io = ntfs_bio_end_io; + submit_bio(bio); + } offset += vol->cluster_size; i++; } /* If @sync, now synchronize the mft mirror. */ - if (sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); + if (sync && ni->mft_no < vol->mftmirr_size) { + int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); + + if (unlikely(sub_err) && !err) + err = sub_err; + } kunmap_local(kaddr); if (unlikely(err)) { /* I/O error during writing. This is really bad! */ @@ -617,10 +636,10 @@ int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int syn bio_put(bio); err_out: /* - * Current state: all buffers are clean, unlocked, and uptodate. - * The caller should mark the base inode as bad so that no more i/o - * happens. ->drop_inode() will still be invoked so all extent inodes - * and other allocated memory will be freed. + * The caller should mark the base inode as bad so no more I/O + * happens. ->drop_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. ENOMEM is retried by + * redirtying the mft record below. 
*/ if (err == -ENOMEM) { ntfs_error(vol->sb,