From 563d0d4c2c1dc1f3f84104c78b388d0490c0086f Mon Sep 17 00:00:00 2001
From: DaeMyung Kang <charsyam@gmail.com>
Date: Fri, 1 May 2026 02:20:53 +0900
Subject: [PATCH] ntfs: wait for sync mft writes to complete

ntfs_sync_mft_mirror() and write_mft_record_nolock() with @sync set
are both documented as synchronous, but neither actually waits for
the bio they submit nor inspects bi_status.  write_inode() can
return success while dirty mft record bytes are still in flight, and
bio errors are silently dropped: the volume is not marked with
errors and the inode is not redirtied.  This breaks fsync()/sync
metadata durability.

Switch ntfs_sync_mft_mirror() and the @sync path of
write_mft_record_nolock() to submit_bio_wait() and propagate the
returned error to the caller.  Capture ntfs_sync_mft_mirror()'s
return value at its call sites in write_mft_record_nolock() so a
mirror write failure surfaces too.

The @sync parameter only controls the main MFT bio.  The !@sync main
submission is therefore unchanged and still uses ntfs_bio_end_io() to
drop the folio reference taken before submission.  The mirror call
has always been documented as performing synchronous I/O regardless
of @sync, so making it actually block restores the originally
intended contract for both @sync and !@sync callers.

Note this only fixes the synchronous mirror/main paths reachable
from write_mft_record_nolock().  The main MFT write submitted from
ntfs_write_mft_block() (the .writepages path) still does not wait
for completion or check bi_status; that requires a larger
restructuring and is left to a follow-up patch.

Fixes: 115380f9a2f9 ("ntfs: update mft operations")
Signed-off-by: DaeMyung Kang <charsyam@gmail.com>
Reviewed-by: Hyunchul Lee <hyc.lee@gmail.com>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
---
 fs/ntfs/mft.c | 63 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index f5186a19dffc..68f6fc8b7b62 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -449,7 +449,7 @@ static void ntfs_bio_end_io(struct bio *bio)
 int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
 		struct mft_record *m)
 {
-	u8 *kmirr = NULL;
+	u8 *kmirr;
 	struct folio *folio;
 	unsigned int folio_ofs, lcn_folio_off = 0;
 	int err = 0;
@@ -479,6 +479,7 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
 	kmirr = kmap_local_folio(folio, 0) + folio_ofs;
 	/* Copy the mst protected mft record to the mirror. */
 	memcpy(kmirr, m, vol->mft_record_size);
+	kunmap_local(kmirr);
 
 	if (vol->cluster_size_bits > PAGE_SHIFT) {
 		lcn_folio_off = folio->index << PAGE_SHIFT;
@@ -490,20 +491,22 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
 		NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, vol->mftmirr_lcn) +
 				 lcn_folio_off + folio_ofs);
 
-	if (!bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) {
+	if (bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs))
+		err = submit_bio_wait(bio);
+	else
 		err = -EIO;
-		bio_put(bio);
-		goto unlock_folio;
-	}
+	bio_put(bio);
 
-	bio->bi_end_io = ntfs_bio_end_io;
-	submit_bio(bio);
-	/* Current state: all buffers are clean, unlocked, and uptodate. */
+	/*
+	 * The in-memory mirror is now valid because we just memcpy()'d the
+	 * mst-protected mft record into it.  Mark the folio uptodate even on
+	 * write error so a subsequent read_mapping_folio() does not refetch
+	 * the stale on-disk mirror and overwrite this copy.  The error is
+	 * propagated to the caller via @err.
+	 */
 	folio_mark_uptodate(folio);
 
-unlock_folio:
 	folio_unlock(folio);
-	kunmap_local(kmirr);
 	folio_put(folio);
 	if (likely(!err)) {
 		ntfs_debug("Done.");
@@ -588,20 +591,36 @@ int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int syn
 		}
 
 		/* Synchronize the mft mirror now if not @sync. */
-		if (!sync && ni->mft_no < vol->mftmirr_size)
-			ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
+		if (!sync && ni->mft_no < vol->mftmirr_size) {
+			int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no,
+							   fixup_m);
+			if (unlikely(sub_err) && !err)
+				err = sub_err;
+		}
 
-		folio_get(folio);
-		bio->bi_private = folio;
-		bio->bi_end_io = ntfs_bio_end_io;
-		submit_bio(bio);
+		if (sync) {
+			int sub_err = submit_bio_wait(bio);
+
+			bio_put(bio);
+			if (unlikely(sub_err) && !err)
+				err = sub_err;
+		} else {
+			folio_get(folio);
+			bio->bi_private = folio;
+			bio->bi_end_io = ntfs_bio_end_io;
+			submit_bio(bio);
+		}
 		offset += vol->cluster_size;
 		i++;
 	}
 
 	/* If @sync, now synchronize the mft mirror. */
-	if (sync && ni->mft_no < vol->mftmirr_size)
-		ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
+	if (sync && ni->mft_no < vol->mftmirr_size) {
+		int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
+
+		if (unlikely(sub_err) && !err)
+			err = sub_err;
+	}
 	kunmap_local(kaddr);
 	if (unlikely(err)) {
 		/* I/O error during writing.  This is really bad! */
@@ -617,10 +636,10 @@ int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int syn
 	bio_put(bio);
 err_out:
 	/*
-	 * Current state: all buffers are clean, unlocked, and uptodate.
-	 * The caller should mark the base inode as bad so that no more i/o
-	 * happens.  ->drop_inode() will still be invoked so all extent inodes
-	 * and other allocated memory will be freed.
+	 * The caller should mark the base inode as bad so no more I/O
+	 * happens. ->drop_inode() will still be invoked so all extent inodes
+	 * and other allocated memory will be freed. ENOMEM is retried by
+	 * redirtying the mft record below.
 	 */
 	if (err == -ENOMEM) {
 		ntfs_error(vol->sb,