linux/fs/btrfs/reflink.c

// SPDX-License-Identifier: GPL-2.0

#include <linux/blkdev.h>
#include <linux/fscrypt.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "compression.h"
#include "delalloc-space.h"
#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"

#define BTRFS_MAX_DEDUPE_LEN	SZ_16M

static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				     struct inode *inode,
				     u64 endoff,
				     const u64 destoff,
				     const u64 olen,
				     bool no_time_update)
{
	int ret;

	inode_inc_iversion(inode);
	if (!no_time_update) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	}
	/*
	 * We round up to the block size at eof when determining which
	 * extents to clone above, but shouldn't round up the file size.
	 */
	if (endoff > destoff + olen)
		endoff = destoff + olen;
	if (endoff > inode->i_size) {
		i_size_write(inode, endoff);
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
	}

	ret = btrfs_update_inode(trans, BTRFS_I(inode));
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}
	return btrfs_end_transaction(trans);
}

static int copy_inline_to_page(struct btrfs_inode *inode,
			       const u64 file_offset,
			       char *inline_data,
			       const u64 size,
			       const u64 datal,
			       const u8 comp_type)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 block_size = fs_info->sectorsize;
	const u64 range_end = file_offset + block_size - 1;
	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
	struct extent_changeset *data_reserved = NULL;
	struct folio *folio = NULL;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	ASSERT(IS_ALIGNED(file_offset, block_size));

	/*
	 * We have flushed and locked the ranges of the source and destination
	 * inodes, we also have locked the inodes, so we are safe to do a
	 * reservation here. Also we must not do the reservation while holding
	 * a transaction open, otherwise we would deadlock.
	 */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
					   block_size);
	if (ret)
		goto out;

	folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
					FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					btrfs_alloc_write_mask(mapping));
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out_unlock;
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0)
		goto out_unlock;

	btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * After dirtying the page our caller will need to start a transaction,
	 * and if we are low on metadata free space, that can cause flushing of
	 * delalloc for all inodes in order to get metadata space released.
	 * However we are holding the range locked for the whole duration of
	 * the clone/dedupe operation, so we may deadlock if that happens and no
	 * other task releases enough space. So mark this inode as not being
	 * possible to flush to avoid such deadlock. We will clear that flag
	 * when we finish cloning all extents, since a transaction is started
	 * after finding each extent to clone.
	 */
	set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);

	if (comp_type == BTRFS_COMPRESS_NONE) {
		memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
					datal);
	} else {
		ret = btrfs_decompress(comp_type, data_start, folio,
				       offset_in_folio(folio, file_offset),
				       inline_size, datal);
		if (ret)
			goto out_unlock;
		flush_dcache_folio(folio);
	}

	/*
	 * If our inline data is smaller then the block/page size, then the
	 * remaining of the block/page is equivalent to zeroes. We had something
	 * like the following done:
	 *
	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
	 * $ sync  # (or fsync)
	 * $ xfs_io -c "falloc 0 4K" file
	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
	 *
	 * So what's in the range [500, 4095] corresponds to zeroes.
	 */
	if (datal < block_size)
		folio_zero_range(folio, datal, block_size - datal);

	btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
	btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
	btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
out_unlock:
	if (!IS_ERR(folio)) {
		folio_unlock(folio);
		folio_put(folio);
	}
	if (ret)
		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
					     block_size, true);
	btrfs_delalloc_release_extents(inode, block_size);
out:
	extent_changeset_free(data_reserved);

	return ret;
}

/*
 * Deal with cloning of inline extents. We try to copy the inline extent from
 * the source inode to destination inode when possible. When not possible we
 * copy the inline extent's data into the respective page of the inode.
 */
static int clone_copy_inline_extent(struct btrfs_inode *inode,
				    struct btrfs_path *path,
				    struct btrfs_key *new_key,
				    const u64 drop_start,
				    const u64 datal,
				    const u64 size,
				    const u8 comp_type,
				    char *inline_data,
				    struct btrfs_trans_handle **trans_out)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u64 aligned_end = ALIGN(new_key->offset + datal,
				      fs_info->sectorsize);
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;
	struct btrfs_key key;

	if (new_key->offset > 0) {
		ret = copy_inline_to_page(inode, new_key->offset,
					  inline_data, size, datal, comp_type);
		goto out;
	}

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				goto copy_inline_extent;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == btrfs_ino(inode) &&
		    key.type == BTRFS_EXTENT_DATA_KEY) {
			/*
			 * There's an implicit hole at file offset 0, copy the
			 * inline extent's data to the page.
			 */
			ASSERT(key.offset > 0);
			goto copy_to_page;
		}
	} else if (i_size_read(&inode->vfs_inode) <= datal) {
		struct btrfs_file_extent_item *ei;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		/*
		 * If it's an inline extent replace it with the source inline
		 * extent, otherwise copy the source inline extent data into
		 * the respective page at the destination inode.
		 */
		if (btrfs_file_extent_type(path->nodes[0], ei) ==
		    BTRFS_FILE_EXTENT_INLINE)
			goto copy_inline_extent;

		goto copy_to_page;
	}

copy_inline_extent:
	/*
	 * We have no extent items, or we have an extent at offset 0 which may
	 * or may not be inlined. All these cases are dealt the same way.
	 */
	if (i_size_read(&inode->vfs_inode) > datal) {
		/*
		 * At the destination offset 0 we have either a hole, a regular
		 * extent or an inline extent larger then the one we want to
		 * clone. Deal with all these cases by copying the inline extent
		 * data into the respective page at the destination inode.
		 */
		goto copy_to_page;
	}

	/*
	 * Release path before starting a new transaction so we don't hold locks
	 * that would confuse lockdep.
	 */
	btrfs_release_path(path);
	/*
	 * If we end up here it means were copy the inline extent into a leaf
	 * of the destination inode. We know we will drop or adjust at most one
	 * extent item in the destination root.
	 *
	 * 1 unit - adjusting old extent (we may have to split it)
	 * 1 unit - add new extent
	 * 1 unit - inode update
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}
	drop_args.path = path;
	drop_args.start = drop_start;
	drop_args.end = aligned_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	write_extent_buffer(path->nodes[0], inline_data,
			    btrfs_item_ptr_offset(path->nodes[0],
						  path->slots[0]),
			    size);
	btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
	btrfs_set_inode_full_sync(inode);
	ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
	if (unlikely(ret))
		btrfs_abort_transaction(trans, ret);
out:
	if (!ret && !trans) {
		/*
		 * No transaction here means we copied the inline extent into a
		 * page of the destination inode.
		 *
		 * 1 unit to update inode item
		 */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
		}
	}
	if (ret && trans)
		btrfs_end_transaction(trans);
	if (!ret)
		*trans_out = trans;

	return ret;

copy_to_page:
	/*
	 * Release our path because we don't need it anymore and also because
	 * copy_inline_to_page() needs to reserve data and metadata, which may
	 * need to flush delalloc when we are low on available space and
	 * therefore cause a deadlock if writeback of an inline extent needs to
	 * write to the same leaf or an ordered extent completion needs to write
	 * to the same leaf.
	 */
	btrfs_release_path(path);

	ret = copy_inline_to_page(inode, new_key->offset,
				  inline_data, size, datal, comp_type);

	/*
	 * If we copied the inline extent data to a page/folio beyond the i_size
	 * of the destination inode, then we need to increase the i_size before
	 * we start a transaction to update the inode item. This is to prevent a
	 * deadlock when the flushoncommit mount option is used, which happens
	 * like this:
	 *
	 * 1) Task A clones an inline extent from inode X to an offset of inode
	 *    Y that is beyond Y's current i_size. This means we copied the
	 *    inline extent's data to a folio of inode Y that is beyond its EOF,
	 *    using the call above to copy_inline_to_page();
	 *
	 * 2) Task B starts a transaction commit and calls
	 *    btrfs_start_delalloc_flush() to flush delalloc;
	 *
	 * 3) The delalloc flushing sees the new dirty folio of inode Y and when
	 *    it attempts to flush it, it ends up at extent_writepage() and sees
	 *    that the offset of the folio is beyond the i_size of inode Y, so
	 *    it attempts to invalidate the folio by calling folio_invalidate(),
	 *    which ends up at btrfs' folio invalidate callback -
	 *    btrfs_invalidate_folio(). There it tries to lock the folio's range
	 *    in inode Y's extent io tree, but it blocks since it's currently
	 *    locked by task A - during reflink we lock the inodes and the
	 *    source and destination ranges after flushing all delalloc and
	 *    waiting for ordered extent completion - after that we don't expect
	 *    to have dirty folios in the ranges, the exception is if we have to
	 *    copy an inline extent's data (because the destination offset is
	 *    not zero);
	 *
	 * 4) Task A then does the 'goto out' below and attempts to start a
	 *    transaction to update the inode item, and then it's blocked since
	 *    the current transaction is in the TRANS_STATE_COMMIT_START state.
	 *    Therefore task A has to wait for the current transaction to become
	 *    unblocked (its state >= TRANS_STATE_UNBLOCKED).
	 *
	 * This leads to a deadlock - the task committing the transaction
	 * waiting for the delalloc flushing which is blocked during folio
	 * invalidation on the inode's extent lock and the reflink task waiting
	 * for the current transaction to be unblocked so that it can start a
	 * a new one to update the inode item (while holding the extent lock).
	 */
	if (ret == 0 && new_key->offset + datal > i_size_read(&inode->vfs_inode))
		i_size_write(&inode->vfs_inode, new_key->offset + datal);

	goto out;
}

/*
 * Clone a range from inode file to another.
 *
 * @src:             Inode to clone from
 * @inode:           Inode to clone to
 * @off:             Offset within source to start clone from
 * @olen:            Original length, passed by user, of range to clone
 * @olen_aligned:    Block-aligned value of olen
 * @destoff:         Offset within @inode to start clone
 * @no_time_update:  Whether to update mtime/ctime on the target inode
 */
static int btrfs_clone(struct inode *src, struct inode *inode,
		       const u64 off, const u64 olen, const u64 olen_aligned,
		       const u64 destoff, bool no_time_update)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	BTRFS_PATH_AUTO_FREE(path);
	struct extent_buffer *leaf;
	struct btrfs_trans_handle *trans;
	char AUTO_KVFREE(buf);
	struct btrfs_key key;
	u32 nritems;
	int slot;
	int ret;
	const u64 len = olen_aligned;
	u64 last_dest_end = destoff;
	u64 prev_extent_end = off;

	ret = -ENOMEM;
	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!buf)
		return ret;

	path = btrfs_alloc_path();
	if (!path)
		return ret;

	path->reada = READA_FORWARD;
	/* Clone data */
	key.objectid = btrfs_ino(BTRFS_I(src));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = off;

	while (1) {
		struct btrfs_file_extent_item *extent;
		u64 extent_gen;
		int type;
		u32 size;
		struct btrfs_key new_key;
		u64 disko = 0, diskl = 0;
		u64 datao = 0, datal = 0;
		u8 comp;
		u64 drop_start;

		/* Note the key will change type as we walk through the tree */
		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				0, 0);
		if (ret < 0)
			goto out;
		/*
		 * First search, if no extent item that starts at offset off was
		 * found but the previous item is an extent item, it's possible
		 * it might overlap our target range, therefore process it.
		 */
		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0] - 1);
			if (key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}

		nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != btrfs_ino(BTRFS_I(src)))
			break;

		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);
		extent_gen = btrfs_file_extent_generation(leaf, extent);
		comp = btrfs_file_extent_compression(leaf, extent);
		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
			datao = btrfs_file_extent_offset(leaf, extent);
			datal = btrfs_file_extent_num_bytes(leaf, extent);
		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
			/* Take upper bound, may be compressed */
			datal = btrfs_file_extent_ram_bytes(leaf, extent);
		}

		/*
		 * The first search might have left us at an extent item that
		 * ends before our target range's start, can happen if we have
		 * holes and NO_HOLES feature enabled.
		 *
		 * Subsequent searches may leave us on a file range we have
		 * processed before - this happens due to a race with ordered
		 * extent completion for a file range that is outside our source
		 * range, but that range was part of a file extent item that
		 * also covered a leading part of our source range.
		 */
		if (key.offset + datal <= prev_extent_end) {
			path->slots[0]++;
			goto process_slot;
		} else if (key.offset >= off + len) {
			break;
		}

		prev_extent_end = key.offset + datal;
		size = btrfs_item_size(leaf, slot);
		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				   size);

		btrfs_release_path(path);

		memcpy(&new_key, &key, sizeof(new_key));
		new_key.objectid = btrfs_ino(BTRFS_I(inode));
		if (off <= key.offset)
			new_key.offset = key.offset + destoff - off;
		else
			new_key.offset = destoff;

		/*
		 * Deal with a hole that doesn't have an extent item that
		 * represents it (NO_HOLES feature enabled).
		 * This hole is either in the middle of the cloning range or at
		 * the beginning (fully overlaps it or partially overlaps it).
		 */
		if (new_key.offset != last_dest_end)
			drop_start = last_dest_end;
		else
			drop_start = new_key.offset;

		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct btrfs_replace_extent_info clone_info;

			/*
			 *    a  | --- range to clone ---|  b
			 * | ------------- extent ------------- |
			 */

			/* Subtract range b */
			if (key.offset + datal > off + len)
				datal = off + len - key.offset;

			/* Subtract range a */
			if (off > key.offset) {
				datao += off - key.offset;
				datal -= off - key.offset;
			}

			clone_info.disk_offset = disko;
			clone_info.disk_len = diskl;
			clone_info.data_offset = datao;
			clone_info.data_len = datal;
			clone_info.file_offset = new_key.offset;
			clone_info.extent_buf = buf;
			clone_info.is_new_extent = false;
			clone_info.update_times = !no_time_update;
			ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
					drop_start, new_key.offset + datal - 1,
					&clone_info, &trans);
			if (ret)
				goto out;
		} else {
			ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
			/*
			 * Inline extents always have to start at file offset 0
			 * and can never be bigger then the sector size. We can
			 * never clone only parts of an inline extent, since all
			 * reflink operations must start at a sector size aligned
			 * offset, and the length must be aligned too or end at
			 * the i_size (which implies the whole inlined data).
			 */
			ASSERT(key.offset == 0);
			ASSERT(datal <= fs_info->sectorsize);
			if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
			    WARN_ON(key.offset != 0) ||
			    WARN_ON(datal > fs_info->sectorsize)) {
				ret = -EUCLEAN;
				goto out;
			}

			ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
						       drop_start, datal, size,
						       comp, buf, &trans);
			if (ret)
				goto out;
		}

		btrfs_release_path(path);

		/*
		 * Whenever we share an extent we update the last_reflink_trans
		 * of each inode to the current transaction. This is needed to
		 * make sure fsync does not log multiple checksum items with
		 * overlapping ranges (because some extent items might refer
		 * only to sections of the original extent). For the destination
		 * inode we do this regardless of the generation of the extents
		 * or even if they are inline extents or explicit holes, to make
		 * sure a full fsync does not skip them. For the source inode,
		 * we only need to update last_reflink_trans in case it's a new
		 * extent that is not a hole or an inline extent, to deal with
		 * the checksums problem on fsync.
		 */
		if (extent_gen == trans->transid && disko > 0)
			BTRFS_I(src)->last_reflink_trans = trans->transid;

		BTRFS_I(inode)->last_reflink_trans = trans->transid;

		last_dest_end = ALIGN(new_key.offset + datal,
				      fs_info->sectorsize);
		ret = clone_finish_inode_update(trans, inode, last_dest_end,
						destoff, olen, no_time_update);
		if (ret)
			goto out;
		if (new_key.offset + datal >= destoff + len)
			break;

		btrfs_release_path(path);
		key.offset = prev_extent_end;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}

		cond_resched();
	}
	ret = 0;

	if (last_dest_end < destoff + len) {
		/*
		 * We have an implicit hole that fully or partially overlaps our
		 * cloning range at its end. This means that we either have the
		 * NO_HOLES feature enabled or the implicit hole happened due to
		 * mixing buffered and direct IO writes against this file.
		 */
		btrfs_release_path(path);

		/*
		 * When using NO_HOLES and we are cloning a range that covers
		 * only a hole (no extents) into a range beyond the current
		 * i_size, punching a hole in the target range will not create
		 * an extent map defining a hole, because the range starts at or
		 * beyond current i_size. If the file previously had an i_size
		 * greater than the new i_size set by this clone operation, we
		 * need to make sure the next fsync is a full fsync, so that it
		 * detects and logs a hole covering a range from the current
		 * i_size to the new i_size. If the clone range covers extents,
		 * besides a hole, then we know the full sync flag was already
		 * set by previous calls to btrfs_replace_file_extents() that
		 * replaced file extent items.
		 */
		if (last_dest_end >= i_size_read(inode))
			btrfs_set_inode_full_sync(BTRFS_I(inode));

		ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
				last_dest_end, destoff + len - 1, NULL, &trans);
		if (ret)
			goto out;

		ret = clone_finish_inode_update(trans, inode, destoff + len,
						destoff, olen, no_time_update);
	}

out:
	clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);

	return ret;
}

static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
	if (inode1 < inode2)
		swap(inode1, inode2);
	down_write(&inode1->i_mmap_lock);
	down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
}

static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
	up_write(&inode1->i_mmap_lock);
	up_write(&inode2->i_mmap_lock);
}

static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
				   struct btrfs_inode *dst, u64 dst_loff)
{
	const u64 end = dst_loff + len - 1;
	struct extent_state *cached_state = NULL;
	struct btrfs_fs_info *fs_info = src->root->fs_info;
	const u64 bs = fs_info->sectorsize;
	int ret;

	/*
	 * Lock destination range to serialize with concurrent readahead(), and
	 * we are safe from concurrency with relocation of source extents
	 * because we have already locked the inode's i_mmap_lock in exclusive
	 * mode.
	 */
	btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
	ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
			  ALIGN(len, bs), dst_loff, true);
	btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}

static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
			     struct inode *dst, u64 dst_loff)
{
	int ret = 0;
	u64 i, tail_len, chunk_count;
	struct btrfs_root *root_dst = BTRFS_I(dst)->root;

	spin_lock(&root_dst->root_item_lock);
	if (root_dst->send_in_progress) {
		btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
			      btrfs_root_id(root_dst),
			      root_dst->send_in_progress);
		spin_unlock(&root_dst->root_item_lock);
		return -EAGAIN;
	}
	root_dst->dedupe_in_progress++;
	spin_unlock(&root_dst->root_item_lock);

	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);

	for (i = 0; i < chunk_count; i++) {
		ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
					      BTRFS_I(dst), dst_loff);
		if (ret)
			goto out;

		loff += BTRFS_MAX_DEDUPE_LEN;
		dst_loff += BTRFS_MAX_DEDUPE_LEN;
	}

	if (tail_len > 0)
		ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
					      BTRFS_I(dst), dst_loff);
out:
	spin_lock(&root_dst->root_item_lock);
	root_dst->dedupe_in_progress--;
	spin_unlock(&root_dst->root_item_lock);

	return ret;
}

static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
					u64 off, u64 olen, u64 destoff)
{
	struct extent_state *cached_state = NULL;
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	int ret;
	u64 len = olen;
	u64 bs = fs_info->sectorsize;
	u64 end;

	/*
	 * VFS's generic_remap_file_range_prep() protects us from cloning the
	 * eof block into the middle of a file, which would result in corruption
	 * if the file size is not blocksize aligned. So we don't need to check
	 * for that case here.
	 */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, bs) - off;

	if (destoff > inode->i_size) {
		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);

		ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
		if (ret)
			return ret;
		/*
		 * We may have truncated the last block if the inode's size is
		 * not sector size aligned, so we need to wait for writeback to
		 * complete before proceeding further, otherwise we can race
		 * with cloning and attempt to increment a reference to an
		 * extent that no longer exists (writeback completed right after
		 * we found the previous extent covering eof and before we
		 * attempted to increment its reference count).
		 */
		ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start,
					       destoff - wb_start);
		if (ret)
			return ret;
	}

	/*
	 * Lock destination range to serialize with concurrent readahead(), and
	 * we are safe from concurrency with relocation of source extents
	 * because we have already locked the inode's i_mmap_lock in exclusive
	 * mode.
	 */
	end = destoff + len - 1;
	btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
	ret = btrfs_clone(src, inode, off, olen, len, destoff, false);
	btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
	if (ret < 0)
		return ret;

	/*
	 * We may have copied an inline extent into a page of the destination
	 * range. So flush delalloc and wait for ordered extent completion.
	 * This is to ensure the invalidation below does not fail, as if for
	 * example it finds a dirty folio, our folio release callback
	 * (btrfs_release_folio()) returns false, which makes the invalidation
	 * return an -EBUSY error. We can't ignore such failures since they
	 * could come from some range other than the copied inline extent's
	 * destination range and we have no way to know that.
	 */
	ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
	if (ret < 0)
		return ret;

	/*
	 * Invalidate page cache so that future reads will see the cloned data
	 * immediately and not the previous data.
	 */
	ret = filemap_invalidate_inode(inode, false, destoff, end);
	if (ret < 0)
		return ret;

	btrfs_btree_balance_dirty(fs_info);

	return 0;
}

static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t *len, unsigned int remap_flags)
{
	struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
	struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
	u64 bs = inode_out->root->fs_info->sectorsize;
	u64 wb_len;
	int ret;

	if (!(remap_flags & REMAP_FILE_DEDUP)) {
		struct btrfs_root *root_out = inode_out->root;

		if (btrfs_root_readonly(root_out))
			return -EROFS;

		ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
	}

	/* Can only reflink encrypted files if both files are encrypted. */
	if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode))
		return -EINVAL;

	/* Don't make the dst file partly checksummed */
	if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
	    (inode_out->flags & BTRFS_INODE_NODATASUM)) {
		return -EINVAL;
	}

	/*
	 * Now that the inodes are locked, we need to start writeback ourselves
	 * and can not rely on the writeback from the VFS's generic helper
	 * generic_remap_file_range_prep() because:
	 *
	 * 1) For compression we must call filemap_fdatawrite_range() range
	 *    twice (btrfs_fdatawrite_range() does it for us), and the generic
	 *    helper only calls it once;
	 *
	 * 2) filemap_fdatawrite_range(), called by the generic helper only
	 *    waits for the writeback to complete, i.e. for IO to be done, and
	 *    not for the ordered extents to complete. We need to wait for them
	 *    to complete so that new file extent items are in the fs tree.
	 */
	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
		wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
	else
		wb_len = ALIGN(*len, bs);

	/*
	 * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
	 *
	 * Btrfs' back references do not have a block level granularity, they
	 * work at the whole extent level.
	 * NOCOW buffered write without data space reserved may not be able
	 * to fall back to CoW due to lack of data space, thus could cause
	 * data loss.
	 *
	 * Here we take a shortcut by flushing the whole inode, so that all
	 * nocow write should reach disk as nocow before we increase the
	 * reference of the extent. We could do better by only flushing NOCOW
	 * data, but that needs extra accounting.
	 *
	 * Also we don't need to check ASYNC_EXTENT, as async extent will be
	 * CoWed anyway, not affecting nocow part.
	 */
	ret = filemap_flush(inode_in->vfs_inode.i_mapping);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
	if (ret < 0)
		return ret;
	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
	if (ret < 0)
		return ret;

	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					    len, remap_flags);
}

static bool file_sync_write(const struct file *file)
{
	if (file->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(file)))
		return true;

	return false;
}

loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
		struct file *dst_file, loff_t destoff, loff_t len,
		unsigned int remap_flags)
{
	struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
	struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
	bool same_inode = dst_inode == src_inode;
	int ret;

	if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))
		return -EIO;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (same_inode) {
		btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
		btrfs_double_mmap_lock(src_inode, dst_inode);
	}

	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
					  &len, remap_flags);
	if (ret < 0 || len == 0)
		goto out_unlock;

	if (remap_flags & REMAP_FILE_DEDUP)
		ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
					&dst_inode->vfs_inode, destoff);
	else
		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);

out_unlock:
	if (same_inode) {
		btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
	} else {
		btrfs_double_mmap_unlock(src_inode, dst_inode);
		unlock_two_nondirectories(&src_inode->vfs_inode,
					  &dst_inode->vfs_inode);
	}

	/*
	 * If either the source or the destination file was opened with O_SYNC,
	 * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
	 * source files/ranges, so that after a successful return (0) followed
	 * by a power failure results in the reflinked data to be readable from
	 * both files/ranges.
	 */
	if (ret == 0 && len > 0 &&
	    (file_sync_write(src_file) || file_sync_write(dst_file))) {
		ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
		if (ret == 0)
			ret = btrfs_sync_file(dst_file, destoff,
					      destoff + len - 1, 0);
	}

	return ret < 0 ? ret : len;
}