mirror of
https://github.com/torvalds/linux.git
synced 2026-06-04 12:35:52 +02:00
When using the flushoncommit mount option, we can have a deadlock between
a transaction commit and a reflink operation that copied an inline extent
to an offset beyond the current i_size of the destination inode.
The deadlock happens like this:
1) Task A clones an inline extent from inode X to an offset of inode Y
that is beyond Y's current i_size. This means we copied the inline
extent's data to a folio of inode Y that is beyond its EOF, using a
call to copy_inline_to_page();
2) Task B starts a transaction commit and calls
btrfs_start_delalloc_flush() to flush delalloc;
3) The delalloc flushing sees the new dirty folio of inode Y and when it
attempts to flush it, it ends up at extent_writepage() and sees that
the offset of the folio is beyond the i_size of inode Y, so it attempts
to invalidate the folio by calling folio_invalidate(), which ends up at
btrfs' folio invalidate callback - btrfs_invalidate_folio(). There it
tries to lock the folio's range in inode Y's extent io tree, but it
blocks since it's currently locked by task A - during a reflink we lock
the inodes and the source and destination ranges after flushing all
delalloc and waiting for ordered extent completion - after that we
don't expect to have dirty folios in the ranges, the exception is if
we have to copy an inline extent's data (because the destination offset
is not zero);
4) Task A then attempts to start a transaction to update the inode item,
and then it's blocked since the current transaction is in the
TRANS_STATE_COMMIT_START state. Therefore task A has to wait for the
current transaction to become unblocked (its state >=
TRANS_STATE_UNBLOCKED).
So task A is waiting for the transaction commit done by task B, and
the latter is waiting on the extent lock of inode Y that is currently
held by task A.
Syzbot recently reported this with the following stack traces:
INFO: task kworker/u8:7:1053 blocked for more than 143 seconds.
Not tainted syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u8:7 state:D stack:23520 pid:1053 tgid:1053 ppid:2 task_flags:0x4208060 flags:0x00080000
Workqueue: writeback wb_workfn (flush-btrfs-46)
Call Trace:
<TASK>
context_switch kernel/sched/core.c:5298 [inline]
__schedule+0x1553/0x5240 kernel/sched/core.c:6911
__schedule_loop kernel/sched/core.c:6993 [inline]
schedule+0x164/0x360 kernel/sched/core.c:7008
wait_extent_bit fs/btrfs/extent-io-tree.c:811 [inline]
btrfs_lock_extent_bits+0x59c/0x700 fs/btrfs/extent-io-tree.c:1914
btrfs_lock_extent fs/btrfs/extent-io-tree.h:152 [inline]
btrfs_invalidate_folio+0x43d/0xc40 fs/btrfs/inode.c:7704
extent_writepage fs/btrfs/extent_io.c:1852 [inline]
extent_write_cache_pages fs/btrfs/extent_io.c:2580 [inline]
btrfs_writepages+0x12ff/0x2440 fs/btrfs/extent_io.c:2713
do_writepages+0x32e/0x550 mm/page-writeback.c:2554
__writeback_single_inode+0x133/0x11a0 fs/fs-writeback.c:1750
writeback_sb_inodes+0x995/0x19d0 fs/fs-writeback.c:2042
wb_writeback+0x456/0xb70 fs/fs-writeback.c:2227
wb_do_writeback fs/fs-writeback.c:2374 [inline]
wb_workfn+0x41a/0xf60 fs/fs-writeback.c:2414
process_one_work kernel/workqueue.c:3276 [inline]
process_scheduled_works+0xb6e/0x18c0 kernel/workqueue.c:3359
worker_thread+0xa53/0xfc0 kernel/workqueue.c:3440
kthread+0x388/0x470 kernel/kthread.c:436
ret_from_fork+0x51e/0xb90 arch/x86/kernel/process.c:158
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
</TASK>
INFO: task syz.4.64:6910 blocked for more than 143 seconds.
Not tainted syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.4.64 state:D stack:22752 pid:6910 tgid:6905 ppid:5944 task_flags:0x400140 flags:0x00080002
Call Trace:
<TASK>
context_switch kernel/sched/core.c:5298 [inline]
__schedule+0x1553/0x5240 kernel/sched/core.c:6911
__schedule_loop kernel/sched/core.c:6993 [inline]
schedule+0x164/0x360 kernel/sched/core.c:7008
wait_current_trans+0x39f/0x590 fs/btrfs/transaction.c:535
start_transaction+0x6a7/0x1650 fs/btrfs/transaction.c:705
clone_copy_inline_extent fs/btrfs/reflink.c:299 [inline]
btrfs_clone+0x128a/0x24d0 fs/btrfs/reflink.c:529
btrfs_clone_files+0x271/0x3f0 fs/btrfs/reflink.c:750
btrfs_remap_file_range+0x76b/0x1320 fs/btrfs/reflink.c:903
vfs_copy_file_range+0xda7/0x1390 fs/read_write.c:1600
__do_sys_copy_file_range fs/read_write.c:1683 [inline]
__se_sys_copy_file_range+0x2fb/0x480 fs/read_write.c:1650
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f5f73afc799
RSP: 002b:00007f5f7315e028 EFLAGS: 00000246 ORIG_RAX: 0000000000000146
RAX: ffffffffffffffda RBX: 00007f5f73d75fa0 RCX: 00007f5f73afc799
RDX: 0000000000000005 RSI: 0000000000000000 RDI: 0000000000000005
RBP: 00007f5f73b92c99 R08: 0000000000000863 R09: 0000000000000000
R10: 00002000000000c0 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f5f73d76038 R14: 00007f5f73d75fa0 R15: 00007fff138a5068
</TASK>
INFO: task syz.4.64:6975 blocked for more than 143 seconds.
Not tainted syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.4.64 state:D stack:24736 pid:6975 tgid:6905 ppid:5944 task_flags:0x400040 flags:0x00080002
Call Trace:
<TASK>
context_switch kernel/sched/core.c:5298 [inline]
__schedule+0x1553/0x5240 kernel/sched/core.c:6911
__schedule_loop kernel/sched/core.c:6993 [inline]
schedule+0x164/0x360 kernel/sched/core.c:7008
wb_wait_for_completion+0x3e8/0x790 fs/fs-writeback.c:227
__writeback_inodes_sb_nr+0x24c/0x2d0 fs/fs-writeback.c:2838
try_to_writeback_inodes_sb+0x9a/0xc0 fs/fs-writeback.c:2886
btrfs_start_delalloc_flush fs/btrfs/transaction.c:2175 [inline]
btrfs_commit_transaction+0x82e/0x31a0 fs/btrfs/transaction.c:2364
btrfs_ioctl+0xca7/0xd00 fs/btrfs/ioctl.c:5206
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:597 [inline]
__se_sys_ioctl+0xff/0x170 fs/ioctl.c:583
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f5f73afc799
RSP: 002b:00007f5f7313d028 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007f5f73d76090 RCX: 00007f5f73afc799
RDX: 0000000000000000 RSI: 0000000000009408 RDI: 0000000000000004
RBP: 00007f5f73b92c99 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f5f73d76128 R14: 00007f5f73d76090 R15: 00007fff138a5068
</TASK>
Fix this by updating the i_size of the destination inode of a reflink
operation after we copy an inline extent's data to an offset beyond the
i_size and before attempting to start a transaction to update the inode's
item.
Reported-by: syzbot+63056bf627663701bbbf@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/69bba3fe.050a0220.227207.002f.GAE@google.com/
Fixes: 05a5a7621c ("Btrfs: implement full reflink support for inline extents")
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
976 lines
31 KiB
C
976 lines
31 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/blkdev.h>
|
|
#include <linux/fscrypt.h>
|
|
#include <linux/iversion.h>
|
|
#include "ctree.h"
|
|
#include "fs.h"
|
|
#include "messages.h"
|
|
#include "compression.h"
|
|
#include "delalloc-space.h"
|
|
#include "disk-io.h"
|
|
#include "reflink.h"
|
|
#include "transaction.h"
|
|
#include "subpage.h"
|
|
#include "accessors.h"
|
|
#include "file-item.h"
|
|
#include "file.h"
|
|
#include "super.h"
|
|
|
|
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
|
|
|
|
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
|
|
struct inode *inode,
|
|
u64 endoff,
|
|
const u64 destoff,
|
|
const u64 olen,
|
|
bool no_time_update)
|
|
{
|
|
int ret;
|
|
|
|
inode_inc_iversion(inode);
|
|
if (!no_time_update) {
|
|
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
|
|
}
|
|
/*
|
|
* We round up to the block size at eof when determining which
|
|
* extents to clone above, but shouldn't round up the file size.
|
|
*/
|
|
if (endoff > destoff + olen)
|
|
endoff = destoff + olen;
|
|
if (endoff > inode->i_size) {
|
|
i_size_write(inode, endoff);
|
|
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
|
|
}
|
|
|
|
ret = btrfs_update_inode(trans, BTRFS_I(inode));
|
|
if (unlikely(ret)) {
|
|
btrfs_abort_transaction(trans, ret);
|
|
btrfs_end_transaction(trans);
|
|
return ret;
|
|
}
|
|
return btrfs_end_transaction(trans);
|
|
}
|
|
|
|
static int copy_inline_to_page(struct btrfs_inode *inode,
|
|
const u64 file_offset,
|
|
char *inline_data,
|
|
const u64 size,
|
|
const u64 datal,
|
|
const u8 comp_type)
|
|
{
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
|
const u32 block_size = fs_info->sectorsize;
|
|
const u64 range_end = file_offset + block_size - 1;
|
|
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
|
|
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
|
|
struct extent_changeset *data_reserved = NULL;
|
|
struct folio *folio = NULL;
|
|
struct address_space *mapping = inode->vfs_inode.i_mapping;
|
|
int ret;
|
|
|
|
ASSERT(IS_ALIGNED(file_offset, block_size));
|
|
|
|
/*
|
|
* We have flushed and locked the ranges of the source and destination
|
|
* inodes, we also have locked the inodes, so we are safe to do a
|
|
* reservation here. Also we must not do the reservation while holding
|
|
* a transaction open, otherwise we would deadlock.
|
|
*/
|
|
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
|
|
block_size);
|
|
if (ret)
|
|
goto out;
|
|
|
|
folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
|
|
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
|
|
btrfs_alloc_write_mask(mapping));
|
|
if (IS_ERR(folio)) {
|
|
ret = PTR_ERR(folio);
|
|
goto out_unlock;
|
|
}
|
|
|
|
ret = set_folio_extent_mapped(folio);
|
|
if (ret < 0)
|
|
goto out_unlock;
|
|
|
|
btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
|
|
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
|
|
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
|
|
if (ret)
|
|
goto out_unlock;
|
|
|
|
/*
|
|
* After dirtying the page our caller will need to start a transaction,
|
|
* and if we are low on metadata free space, that can cause flushing of
|
|
* delalloc for all inodes in order to get metadata space released.
|
|
* However we are holding the range locked for the whole duration of
|
|
* the clone/dedupe operation, so we may deadlock if that happens and no
|
|
* other task releases enough space. So mark this inode as not being
|
|
* possible to flush to avoid such deadlock. We will clear that flag
|
|
* when we finish cloning all extents, since a transaction is started
|
|
* after finding each extent to clone.
|
|
*/
|
|
set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
|
|
|
|
if (comp_type == BTRFS_COMPRESS_NONE) {
|
|
memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
|
|
datal);
|
|
} else {
|
|
ret = btrfs_decompress(comp_type, data_start, folio,
|
|
offset_in_folio(folio, file_offset),
|
|
inline_size, datal);
|
|
if (ret)
|
|
goto out_unlock;
|
|
flush_dcache_folio(folio);
|
|
}
|
|
|
|
/*
|
|
* If our inline data is smaller then the block/page size, then the
|
|
* remaining of the block/page is equivalent to zeroes. We had something
|
|
* like the following done:
|
|
*
|
|
* $ xfs_io -f -c "pwrite -S 0xab 0 500" file
|
|
* $ sync # (or fsync)
|
|
* $ xfs_io -c "falloc 0 4K" file
|
|
* $ xfs_io -c "pwrite -S 0xcd 4K 4K"
|
|
*
|
|
* So what's in the range [500, 4095] corresponds to zeroes.
|
|
*/
|
|
if (datal < block_size)
|
|
folio_zero_range(folio, datal, block_size - datal);
|
|
|
|
btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
|
|
btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
|
|
btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
|
|
out_unlock:
|
|
if (!IS_ERR(folio)) {
|
|
folio_unlock(folio);
|
|
folio_put(folio);
|
|
}
|
|
if (ret)
|
|
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
|
|
block_size, true);
|
|
btrfs_delalloc_release_extents(inode, block_size);
|
|
out:
|
|
extent_changeset_free(data_reserved);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Deal with cloning of inline extents. We try to copy the inline extent from
|
|
* the source inode to destination inode when possible. When not possible we
|
|
* copy the inline extent's data into the respective page of the inode.
|
|
*/
|
|
static int clone_copy_inline_extent(struct btrfs_inode *inode,
|
|
struct btrfs_path *path,
|
|
struct btrfs_key *new_key,
|
|
const u64 drop_start,
|
|
const u64 datal,
|
|
const u64 size,
|
|
const u8 comp_type,
|
|
char *inline_data,
|
|
struct btrfs_trans_handle **trans_out)
|
|
{
|
|
struct btrfs_root *root = inode->root;
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
const u64 aligned_end = ALIGN(new_key->offset + datal,
|
|
fs_info->sectorsize);
|
|
struct btrfs_trans_handle *trans = NULL;
|
|
struct btrfs_drop_extents_args drop_args = { 0 };
|
|
int ret;
|
|
struct btrfs_key key;
|
|
|
|
if (new_key->offset > 0) {
|
|
ret = copy_inline_to_page(inode, new_key->offset,
|
|
inline_data, size, datal, comp_type);
|
|
goto out;
|
|
}
|
|
|
|
key.objectid = btrfs_ino(inode);
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
key.offset = 0;
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
if (ret < 0) {
|
|
return ret;
|
|
} else if (ret > 0) {
|
|
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
|
|
ret = btrfs_next_leaf(root, path);
|
|
if (ret < 0)
|
|
return ret;
|
|
else if (ret > 0)
|
|
goto copy_inline_extent;
|
|
}
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
|
|
if (key.objectid == btrfs_ino(inode) &&
|
|
key.type == BTRFS_EXTENT_DATA_KEY) {
|
|
/*
|
|
* There's an implicit hole at file offset 0, copy the
|
|
* inline extent's data to the page.
|
|
*/
|
|
ASSERT(key.offset > 0);
|
|
goto copy_to_page;
|
|
}
|
|
} else if (i_size_read(&inode->vfs_inode) <= datal) {
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
struct btrfs_file_extent_item);
|
|
/*
|
|
* If it's an inline extent replace it with the source inline
|
|
* extent, otherwise copy the source inline extent data into
|
|
* the respective page at the destination inode.
|
|
*/
|
|
if (btrfs_file_extent_type(path->nodes[0], ei) ==
|
|
BTRFS_FILE_EXTENT_INLINE)
|
|
goto copy_inline_extent;
|
|
|
|
goto copy_to_page;
|
|
}
|
|
|
|
copy_inline_extent:
|
|
/*
|
|
* We have no extent items, or we have an extent at offset 0 which may
|
|
* or may not be inlined. All these cases are dealt the same way.
|
|
*/
|
|
if (i_size_read(&inode->vfs_inode) > datal) {
|
|
/*
|
|
* At the destination offset 0 we have either a hole, a regular
|
|
* extent or an inline extent larger then the one we want to
|
|
* clone. Deal with all these cases by copying the inline extent
|
|
* data into the respective page at the destination inode.
|
|
*/
|
|
goto copy_to_page;
|
|
}
|
|
|
|
/*
|
|
* Release path before starting a new transaction so we don't hold locks
|
|
* that would confuse lockdep.
|
|
*/
|
|
btrfs_release_path(path);
|
|
/*
|
|
* If we end up here it means were copy the inline extent into a leaf
|
|
* of the destination inode. We know we will drop or adjust at most one
|
|
* extent item in the destination root.
|
|
*
|
|
* 1 unit - adjusting old extent (we may have to split it)
|
|
* 1 unit - add new extent
|
|
* 1 unit - inode update
|
|
*/
|
|
trans = btrfs_start_transaction(root, 3);
|
|
if (IS_ERR(trans)) {
|
|
ret = PTR_ERR(trans);
|
|
trans = NULL;
|
|
goto out;
|
|
}
|
|
drop_args.path = path;
|
|
drop_args.start = drop_start;
|
|
drop_args.end = aligned_end;
|
|
drop_args.drop_cache = true;
|
|
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
|
|
if (unlikely(ret)) {
|
|
btrfs_abort_transaction(trans, ret);
|
|
goto out;
|
|
}
|
|
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
|
|
if (unlikely(ret)) {
|
|
btrfs_abort_transaction(trans, ret);
|
|
goto out;
|
|
}
|
|
|
|
write_extent_buffer(path->nodes[0], inline_data,
|
|
btrfs_item_ptr_offset(path->nodes[0],
|
|
path->slots[0]),
|
|
size);
|
|
btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
|
|
btrfs_set_inode_full_sync(inode);
|
|
ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
|
|
if (unlikely(ret))
|
|
btrfs_abort_transaction(trans, ret);
|
|
out:
|
|
if (!ret && !trans) {
|
|
/*
|
|
* No transaction here means we copied the inline extent into a
|
|
* page of the destination inode.
|
|
*
|
|
* 1 unit to update inode item
|
|
*/
|
|
trans = btrfs_start_transaction(root, 1);
|
|
if (IS_ERR(trans)) {
|
|
ret = PTR_ERR(trans);
|
|
trans = NULL;
|
|
}
|
|
}
|
|
if (ret && trans)
|
|
btrfs_end_transaction(trans);
|
|
if (!ret)
|
|
*trans_out = trans;
|
|
|
|
return ret;
|
|
|
|
copy_to_page:
|
|
/*
|
|
* Release our path because we don't need it anymore and also because
|
|
* copy_inline_to_page() needs to reserve data and metadata, which may
|
|
* need to flush delalloc when we are low on available space and
|
|
* therefore cause a deadlock if writeback of an inline extent needs to
|
|
* write to the same leaf or an ordered extent completion needs to write
|
|
* to the same leaf.
|
|
*/
|
|
btrfs_release_path(path);
|
|
|
|
ret = copy_inline_to_page(inode, new_key->offset,
|
|
inline_data, size, datal, comp_type);
|
|
|
|
/*
|
|
* If we copied the inline extent data to a page/folio beyond the i_size
|
|
* of the destination inode, then we need to increase the i_size before
|
|
* we start a transaction to update the inode item. This is to prevent a
|
|
* deadlock when the flushoncommit mount option is used, which happens
|
|
* like this:
|
|
*
|
|
* 1) Task A clones an inline extent from inode X to an offset of inode
|
|
* Y that is beyond Y's current i_size. This means we copied the
|
|
* inline extent's data to a folio of inode Y that is beyond its EOF,
|
|
* using the call above to copy_inline_to_page();
|
|
*
|
|
* 2) Task B starts a transaction commit and calls
|
|
* btrfs_start_delalloc_flush() to flush delalloc;
|
|
*
|
|
* 3) The delalloc flushing sees the new dirty folio of inode Y and when
|
|
* it attempts to flush it, it ends up at extent_writepage() and sees
|
|
* that the offset of the folio is beyond the i_size of inode Y, so
|
|
* it attempts to invalidate the folio by calling folio_invalidate(),
|
|
* which ends up at btrfs' folio invalidate callback -
|
|
* btrfs_invalidate_folio(). There it tries to lock the folio's range
|
|
* in inode Y's extent io tree, but it blocks since it's currently
|
|
* locked by task A - during reflink we lock the inodes and the
|
|
* source and destination ranges after flushing all delalloc and
|
|
* waiting for ordered extent completion - after that we don't expect
|
|
* to have dirty folios in the ranges, the exception is if we have to
|
|
* copy an inline extent's data (because the destination offset is
|
|
* not zero);
|
|
*
|
|
* 4) Task A then does the 'goto out' below and attempts to start a
|
|
* transaction to update the inode item, and then it's blocked since
|
|
* the current transaction is in the TRANS_STATE_COMMIT_START state.
|
|
* Therefore task A has to wait for the current transaction to become
|
|
* unblocked (its state >= TRANS_STATE_UNBLOCKED).
|
|
*
|
|
* This leads to a deadlock - the task committing the transaction
|
|
* waiting for the delalloc flushing which is blocked during folio
|
|
* invalidation on the inode's extent lock and the reflink task waiting
|
|
* for the current transaction to be unblocked so that it can start a
|
|
* a new one to update the inode item (while holding the extent lock).
|
|
*/
|
|
if (ret == 0 && new_key->offset + datal > i_size_read(&inode->vfs_inode))
|
|
i_size_write(&inode->vfs_inode, new_key->offset + datal);
|
|
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* Clone a range from inode file to another.
|
|
*
|
|
* @src: Inode to clone from
|
|
* @inode: Inode to clone to
|
|
* @off: Offset within source to start clone from
|
|
* @olen: Original length, passed by user, of range to clone
|
|
* @olen_aligned: Block-aligned value of olen
|
|
* @destoff: Offset within @inode to start clone
|
|
* @no_time_update: Whether to update mtime/ctime on the target inode
|
|
*/
|
|
static int btrfs_clone(struct inode *src, struct inode *inode,
|
|
const u64 off, const u64 olen, const u64 olen_aligned,
|
|
const u64 destoff, bool no_time_update)
|
|
{
|
|
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
|
|
BTRFS_PATH_AUTO_FREE(path);
|
|
struct extent_buffer *leaf;
|
|
struct btrfs_trans_handle *trans;
|
|
char AUTO_KVFREE(buf);
|
|
struct btrfs_key key;
|
|
u32 nritems;
|
|
int slot;
|
|
int ret;
|
|
const u64 len = olen_aligned;
|
|
u64 last_dest_end = destoff;
|
|
u64 prev_extent_end = off;
|
|
|
|
ret = -ENOMEM;
|
|
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
|
|
if (!buf)
|
|
return ret;
|
|
|
|
path = btrfs_alloc_path();
|
|
if (!path)
|
|
return ret;
|
|
|
|
path->reada = READA_FORWARD;
|
|
/* Clone data */
|
|
key.objectid = btrfs_ino(BTRFS_I(src));
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
key.offset = off;
|
|
|
|
while (1) {
|
|
struct btrfs_file_extent_item *extent;
|
|
u64 extent_gen;
|
|
int type;
|
|
u32 size;
|
|
struct btrfs_key new_key;
|
|
u64 disko = 0, diskl = 0;
|
|
u64 datao = 0, datal = 0;
|
|
u8 comp;
|
|
u64 drop_start;
|
|
|
|
/* Note the key will change type as we walk through the tree */
|
|
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
|
|
0, 0);
|
|
if (ret < 0)
|
|
goto out;
|
|
/*
|
|
* First search, if no extent item that starts at offset off was
|
|
* found but the previous item is an extent item, it's possible
|
|
* it might overlap our target range, therefore process it.
|
|
*/
|
|
if (key.offset == off && ret > 0 && path->slots[0] > 0) {
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key,
|
|
path->slots[0] - 1);
|
|
if (key.type == BTRFS_EXTENT_DATA_KEY)
|
|
path->slots[0]--;
|
|
}
|
|
|
|
nritems = btrfs_header_nritems(path->nodes[0]);
|
|
process_slot:
|
|
if (path->slots[0] >= nritems) {
|
|
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
|
|
if (ret < 0)
|
|
goto out;
|
|
if (ret > 0)
|
|
break;
|
|
nritems = btrfs_header_nritems(path->nodes[0]);
|
|
}
|
|
leaf = path->nodes[0];
|
|
slot = path->slots[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
if (key.type > BTRFS_EXTENT_DATA_KEY ||
|
|
key.objectid != btrfs_ino(BTRFS_I(src)))
|
|
break;
|
|
|
|
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
|
|
|
|
extent = btrfs_item_ptr(leaf, slot,
|
|
struct btrfs_file_extent_item);
|
|
extent_gen = btrfs_file_extent_generation(leaf, extent);
|
|
comp = btrfs_file_extent_compression(leaf, extent);
|
|
type = btrfs_file_extent_type(leaf, extent);
|
|
if (type == BTRFS_FILE_EXTENT_REG ||
|
|
type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
disko = btrfs_file_extent_disk_bytenr(leaf, extent);
|
|
diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
|
|
datao = btrfs_file_extent_offset(leaf, extent);
|
|
datal = btrfs_file_extent_num_bytes(leaf, extent);
|
|
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
|
|
/* Take upper bound, may be compressed */
|
|
datal = btrfs_file_extent_ram_bytes(leaf, extent);
|
|
}
|
|
|
|
/*
|
|
* The first search might have left us at an extent item that
|
|
* ends before our target range's start, can happen if we have
|
|
* holes and NO_HOLES feature enabled.
|
|
*
|
|
* Subsequent searches may leave us on a file range we have
|
|
* processed before - this happens due to a race with ordered
|
|
* extent completion for a file range that is outside our source
|
|
* range, but that range was part of a file extent item that
|
|
* also covered a leading part of our source range.
|
|
*/
|
|
if (key.offset + datal <= prev_extent_end) {
|
|
path->slots[0]++;
|
|
goto process_slot;
|
|
} else if (key.offset >= off + len) {
|
|
break;
|
|
}
|
|
|
|
prev_extent_end = key.offset + datal;
|
|
size = btrfs_item_size(leaf, slot);
|
|
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
|
|
size);
|
|
|
|
btrfs_release_path(path);
|
|
|
|
memcpy(&new_key, &key, sizeof(new_key));
|
|
new_key.objectid = btrfs_ino(BTRFS_I(inode));
|
|
if (off <= key.offset)
|
|
new_key.offset = key.offset + destoff - off;
|
|
else
|
|
new_key.offset = destoff;
|
|
|
|
/*
|
|
* Deal with a hole that doesn't have an extent item that
|
|
* represents it (NO_HOLES feature enabled).
|
|
* This hole is either in the middle of the cloning range or at
|
|
* the beginning (fully overlaps it or partially overlaps it).
|
|
*/
|
|
if (new_key.offset != last_dest_end)
|
|
drop_start = last_dest_end;
|
|
else
|
|
drop_start = new_key.offset;
|
|
|
|
if (type == BTRFS_FILE_EXTENT_REG ||
|
|
type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
struct btrfs_replace_extent_info clone_info;
|
|
|
|
/*
|
|
* a | --- range to clone ---| b
|
|
* | ------------- extent ------------- |
|
|
*/
|
|
|
|
/* Subtract range b */
|
|
if (key.offset + datal > off + len)
|
|
datal = off + len - key.offset;
|
|
|
|
/* Subtract range a */
|
|
if (off > key.offset) {
|
|
datao += off - key.offset;
|
|
datal -= off - key.offset;
|
|
}
|
|
|
|
clone_info.disk_offset = disko;
|
|
clone_info.disk_len = diskl;
|
|
clone_info.data_offset = datao;
|
|
clone_info.data_len = datal;
|
|
clone_info.file_offset = new_key.offset;
|
|
clone_info.extent_buf = buf;
|
|
clone_info.is_new_extent = false;
|
|
clone_info.update_times = !no_time_update;
|
|
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
|
|
drop_start, new_key.offset + datal - 1,
|
|
&clone_info, &trans);
|
|
if (ret)
|
|
goto out;
|
|
} else {
|
|
ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
|
|
/*
|
|
* Inline extents always have to start at file offset 0
|
|
* and can never be bigger then the sector size. We can
|
|
* never clone only parts of an inline extent, since all
|
|
* reflink operations must start at a sector size aligned
|
|
* offset, and the length must be aligned too or end at
|
|
* the i_size (which implies the whole inlined data).
|
|
*/
|
|
ASSERT(key.offset == 0);
|
|
ASSERT(datal <= fs_info->sectorsize);
|
|
if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
|
|
WARN_ON(key.offset != 0) ||
|
|
WARN_ON(datal > fs_info->sectorsize)) {
|
|
ret = -EUCLEAN;
|
|
goto out;
|
|
}
|
|
|
|
ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
|
|
drop_start, datal, size,
|
|
comp, buf, &trans);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
|
|
btrfs_release_path(path);
|
|
|
|
/*
|
|
* Whenever we share an extent we update the last_reflink_trans
|
|
* of each inode to the current transaction. This is needed to
|
|
* make sure fsync does not log multiple checksum items with
|
|
* overlapping ranges (because some extent items might refer
|
|
* only to sections of the original extent). For the destination
|
|
* inode we do this regardless of the generation of the extents
|
|
* or even if they are inline extents or explicit holes, to make
|
|
* sure a full fsync does not skip them. For the source inode,
|
|
* we only need to update last_reflink_trans in case it's a new
|
|
* extent that is not a hole or an inline extent, to deal with
|
|
* the checksums problem on fsync.
|
|
*/
|
|
if (extent_gen == trans->transid && disko > 0)
|
|
BTRFS_I(src)->last_reflink_trans = trans->transid;
|
|
|
|
BTRFS_I(inode)->last_reflink_trans = trans->transid;
|
|
|
|
last_dest_end = ALIGN(new_key.offset + datal,
|
|
fs_info->sectorsize);
|
|
ret = clone_finish_inode_update(trans, inode, last_dest_end,
|
|
destoff, olen, no_time_update);
|
|
if (ret)
|
|
goto out;
|
|
if (new_key.offset + datal >= destoff + len)
|
|
break;
|
|
|
|
btrfs_release_path(path);
|
|
key.offset = prev_extent_end;
|
|
|
|
if (fatal_signal_pending(current)) {
|
|
ret = -EINTR;
|
|
goto out;
|
|
}
|
|
|
|
cond_resched();
|
|
}
|
|
ret = 0;
|
|
|
|
if (last_dest_end < destoff + len) {
|
|
/*
|
|
* We have an implicit hole that fully or partially overlaps our
|
|
* cloning range at its end. This means that we either have the
|
|
* NO_HOLES feature enabled or the implicit hole happened due to
|
|
* mixing buffered and direct IO writes against this file.
|
|
*/
|
|
btrfs_release_path(path);
|
|
|
|
/*
|
|
* When using NO_HOLES and we are cloning a range that covers
|
|
* only a hole (no extents) into a range beyond the current
|
|
* i_size, punching a hole in the target range will not create
|
|
* an extent map defining a hole, because the range starts at or
|
|
* beyond current i_size. If the file previously had an i_size
|
|
* greater than the new i_size set by this clone operation, we
|
|
* need to make sure the next fsync is a full fsync, so that it
|
|
* detects and logs a hole covering a range from the current
|
|
* i_size to the new i_size. If the clone range covers extents,
|
|
* besides a hole, then we know the full sync flag was already
|
|
* set by previous calls to btrfs_replace_file_extents() that
|
|
* replaced file extent items.
|
|
*/
|
|
if (last_dest_end >= i_size_read(inode))
|
|
btrfs_set_inode_full_sync(BTRFS_I(inode));
|
|
|
|
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
|
|
last_dest_end, destoff + len - 1, NULL, &trans);
|
|
if (ret)
|
|
goto out;
|
|
|
|
ret = clone_finish_inode_update(trans, inode, destoff + len,
|
|
destoff, olen, no_time_update);
|
|
}
|
|
|
|
out:
|
|
clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
|
|
{
|
|
if (inode1 < inode2)
|
|
swap(inode1, inode2);
|
|
down_write(&inode1->i_mmap_lock);
|
|
down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
|
|
}
|
|
|
|
static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
|
|
{
|
|
up_write(&inode1->i_mmap_lock);
|
|
up_write(&inode2->i_mmap_lock);
|
|
}
|
|
|
|
/*
 * Deduplicate a single range of @src into @dst by cloning it.
 *
 * @src:      source inode
 * @loff:     start offset of the range in @src
 * @len:      length of the range
 * @dst:      destination inode
 * @dst_loff: start offset of the range in @dst
 *
 * Returns the result of btrfs_clone() for the range.
 */
static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
				   struct btrfs_inode *dst, u64 dst_loff)
{
	struct btrfs_fs_info *fs_info = src->root->fs_info;
	struct extent_state *cached_state = NULL;
	const u64 aligned_len = ALIGN(len, fs_info->sectorsize);
	const u64 dst_end = dst_loff + len - 1;
	int ret;

	/*
	 * Lock destination range to serialize with concurrent readahead(), and
	 * we are safe from concurrency with relocation of source extents
	 * because we have already locked the inode's i_mmap_lock in exclusive
	 * mode.
	 */
	btrfs_lock_extent(&dst->io_tree, dst_loff, dst_end, &cached_state);
	/* Dedupe must not touch the timestamps of the destination. */
	ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
			  aligned_len, dst_loff, true);
	btrfs_unlock_extent(&dst->io_tree, dst_loff, dst_end, &cached_state);

	btrfs_btree_balance_dirty(fs_info);

	return ret;
}
|
|
|
|
/*
 * Deduplicate the range [loff, loff + olen) of @src into @dst at @dst_loff.
 *
 * The range is processed in chunks of at most BTRFS_MAX_DEDUPE_LEN bytes.
 * Fails with -EAGAIN if a send operation is using the destination root.
 *
 * Returns 0 on success or a negative errno from the first failing chunk.
 */
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
			     struct inode *dst, u64 dst_loff)
{
	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
	u64 chunks_left;
	u64 tail_len;
	int ret = 0;

	/* Dedupe and send are mutually exclusive on the destination root. */
	spin_lock(&root_dst->root_item_lock);
	if (root_dst->send_in_progress) {
		btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
			      btrfs_root_id(root_dst),
			      root_dst->send_in_progress);
		spin_unlock(&root_dst->root_item_lock);
		return -EAGAIN;
	}
	root_dst->dedupe_in_progress++;
	spin_unlock(&root_dst->root_item_lock);

	chunks_left = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;

	/* Full sized chunks first. */
	for (; chunks_left > 0; chunks_left--) {
		ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
					      BTRFS_I(dst), dst_loff);
		if (ret)
			goto out;

		loff += BTRFS_MAX_DEDUPE_LEN;
		dst_loff += BTRFS_MAX_DEDUPE_LEN;
	}

	/* Then whatever is left that does not fill a whole chunk. */
	if (tail_len > 0)
		ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
					      BTRFS_I(dst), dst_loff);
out:
	spin_lock(&root_dst->root_item_lock);
	root_dst->dedupe_in_progress--;
	spin_unlock(&root_dst->root_item_lock);

	return ret;
}
|
|
|
|
/*
 * Clone @olen bytes from offset @off of @file_src into offset @destoff of
 * @file.
 *
 * Steps, in order: expand the destination up to @destoff when it lies beyond
 * the current i_size, clone with the destination range locked in the io tree,
 * wait for ordered extents in the destination range and finally invalidate
 * the page cache for that range.
 *
 * Returns 0 on success or the negative value returned by the failing step.
 */
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
				      u64 off, u64 olen, u64 destoff)
{
	struct inode *dst_vfs = file_inode(file);
	struct inode *src_vfs = file_inode(file_src);
	struct btrfs_inode *dst = BTRFS_I(dst_vfs);
	struct extent_io_tree *dst_tree = &dst->io_tree;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dst_vfs);
	struct extent_state *cached = NULL;
	u64 blocksize = fs_info->sectorsize;
	u64 clone_len = olen;
	u64 last;
	int ret;

	/*
	 * Cloning the eof block into the middle of a file would corrupt data
	 * when the file size is not block size aligned, but the VFS helper
	 * generic_remap_file_range_prep() already guards against that, so no
	 * check for it is needed here. When the range ends exactly at the
	 * source's eof, extend the length to the end of that block.
	 */
	if (off + clone_len == src_vfs->i_size)
		clone_len = ALIGN(src_vfs->i_size, blocksize) - off;

	if (destoff > dst_vfs->i_size) {
		const u64 wb_start = ALIGN_DOWN(dst_vfs->i_size, blocksize);

		ret = btrfs_cont_expand(dst, dst_vfs->i_size, destoff);
		if (ret)
			return ret;

		/*
		 * The last block may have been truncated if the inode's size
		 * was not sector size aligned. Wait for writeback to finish
		 * before going on, otherwise we can race with cloning and try
		 * to add a reference to an extent that is already gone
		 * (writeback completing right after the previous extent
		 * covering eof was found and before its reference count was
		 * incremented).
		 */
		ret = btrfs_wait_ordered_range(dst, wb_start,
					       destoff - wb_start);
		if (ret)
			return ret;
	}

	/*
	 * Locking the destination range serializes us with concurrent
	 * readahead(). Relocation of source extents is not a concern because
	 * the inode's i_mmap_lock is already held in exclusive mode.
	 */
	last = destoff + clone_len - 1;
	btrfs_lock_extent(dst_tree, destoff, last, &cached);
	ret = btrfs_clone(src_vfs, dst_vfs, off, olen, clone_len, destoff, false);
	btrfs_unlock_extent(dst_tree, destoff, last, &cached);
	if (ret < 0)
		return ret;

	/*
	 * An inline extent may have been copied into a page of the destination
	 * range, so flush delalloc and wait for ordered extents to complete.
	 * That keeps the invalidation below from failing: if it found a dirty
	 * folio, btrfs_release_folio() would return false and the invalidation
	 * would return -EBUSY. Such a failure can't be ignored because it
	 * could come from a range other than the copied inline extent's
	 * destination, and we have no way of telling.
	 */
	ret = btrfs_wait_ordered_range(dst, destoff, clone_len);
	if (ret < 0)
		return ret;

	/*
	 * Drop the page cache for the range so that reads issued from now on
	 * return the cloned data rather than what was there before.
	 */
	ret = filemap_invalidate_inode(dst_vfs, false, destoff, last);
	if (ret < 0)
		return ret;

	btrfs_btree_balance_dirty(fs_info);

	return 0;
}
|
|
|
|
/*
 * Btrfs specific checks and flushing done ahead of a remap (clone or dedupe)
 * of @file_in at @pos_in into @file_out at @pos_out, followed by the generic
 * VFS preparation.
 *
 * Returns -EROFS when cloning into a read-only root, -EINVAL when the two
 * inodes disagree on encryption or on the NODATASUM flag, a negative value
 * from flushing/waiting, or otherwise whatever
 * generic_remap_file_range_prep() returns.
 */
static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t *len, unsigned int remap_flags)
{
	struct btrfs_inode *src = BTRFS_I(file_inode(file_in));
	struct btrfs_inode *dst = BTRFS_I(file_inode(file_out));
	const bool dedupe = remap_flags & REMAP_FILE_DEDUP;
	u64 bs = dst->root->fs_info->sectorsize;
	u64 wb_len;
	int ret;

	if (!dedupe) {
		if (btrfs_root_readonly(dst->root))
			return -EROFS;

		ASSERT(src->vfs_inode.i_sb == dst->vfs_inode.i_sb);
	}

	/* Reflinking encrypted files requires both files to be encrypted. */
	if (IS_ENCRYPTED(&src->vfs_inode) != IS_ENCRYPTED(&dst->vfs_inode))
		return -EINVAL;

	/* The destination file must not end up only partly checksummed. */
	if ((src->flags ^ dst->flags) & BTRFS_INODE_NODATASUM)
		return -EINVAL;

	/*
	 * With the inodes locked we have to start writeback ourselves instead
	 * of relying on the one done by the VFS helper
	 * generic_remap_file_range_prep(), for two reasons:
	 *
	 * 1) With compression filemap_fdatawrite_range() has to be called
	 *    twice (btrfs_fdatawrite_range() takes care of that), while the
	 *    generic helper calls it only once;
	 *
	 * 2) filemap_fdatawrite_range(), as called by the generic helper,
	 *    waits only for writeback, i.e. for the IO to be done, and not for
	 *    ordered extents to complete. We need the latter so that the new
	 *    file extent items are in the fs tree.
	 */
	if (*len == 0 && !dedupe) {
		wb_len = ALIGN(src->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
	} else {
		wb_len = ALIGN(*len, bs);
	}

	/*
	 * Workaround that makes NOCOW buffered writes reach disk as NOCOW.
	 *
	 * Back references in btrfs have no block level granularity, they work
	 * on whole extents. A NOCOW buffered write done without reserved data
	 * space may be unable to fall back to CoW when data space is lacking,
	 * which could cause data loss.
	 *
	 * The shortcut taken here is to flush the whole inode, so every nocow
	 * write reaches disk as nocow before the extent's reference count is
	 * increased. Flushing only the NOCOW data would be better, but needs
	 * extra accounting.
	 *
	 * ASYNC_EXTENT needs no check, since async extents are CoWed anyway
	 * and do not affect the nocow part.
	 */
	ret = filemap_flush(src->vfs_inode.i_mapping);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(src, ALIGN_DOWN(pos_in, bs), wb_len);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(dst, ALIGN_DOWN(pos_out, bs), wb_len);
	if (ret < 0)
		return ret;

	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					     len, remap_flags);
}
|
|
|
|
static bool file_sync_write(const struct file *file)
|
|
{
|
|
if (file->f_flags & (__O_SYNC | O_DSYNC))
|
|
return true;
|
|
if (IS_SYNC(file_inode(file)))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
 * Remap (clone or, with REMAP_FILE_DEDUP, deduplicate) @len bytes from
 * @src_file at @off into @dst_file at @destoff.
 *
 * Only REMAP_FILE_DEDUP and REMAP_FILE_ADVISORY are accepted in @remap_flags,
 * anything else yields -EINVAL. -EIO is returned when the filesystem is shut
 * down.
 *
 * Returns a negative errno on failure, otherwise the length left in @len
 * after btrfs_remap_file_range_prep() had the chance to adjust it.
 */
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
		struct file *dst_file, loff_t destoff, loff_t len,
		unsigned int remap_flags)
{
	struct btrfs_inode *src = BTRFS_I(file_inode(src_file));
	struct btrfs_inode *dst = BTRFS_I(file_inode(dst_file));
	const bool one_inode = (dst == src);
	int ret;

	if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))
		return -EIO;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!one_inode) {
		lock_two_nondirectories(&src->vfs_inode, &dst->vfs_inode);
		btrfs_double_mmap_lock(src, dst);
	} else {
		btrfs_inode_lock(src, BTRFS_ILOCK_MMAP);
	}

	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
					  &len, remap_flags);

	/* Only do the actual work if preparation succeeded and left a range. */
	if (ret >= 0 && len != 0) {
		if (remap_flags & REMAP_FILE_DEDUP) {
			ret = btrfs_extent_same(&src->vfs_inode, off, len,
						&dst->vfs_inode, destoff);
		} else {
			ret = btrfs_clone_files(dst_file, src_file, off, len,
						destoff);
		}
	}

	/* Release the locks in the reverse order they were taken. */
	if (!one_inode) {
		btrfs_double_mmap_unlock(src, dst);
		unlock_two_nondirectories(&src->vfs_inode, &dst->vfs_inode);
	} else {
		btrfs_inode_unlock(src, BTRFS_ILOCK_MMAP);
	}

	/*
	 * When the source or the destination file was opened with O_SYNC or
	 * O_DSYNC, or has the S_SYNC attribute, fsync the ranges of both
	 * files, so that the reflinked data can be read from both of them if
	 * a power failure follows a successful return.
	 */
	if (ret == 0 && len > 0) {
		if (file_sync_write(src_file) || file_sync_write(dst_file)) {
			ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
			if (ret == 0)
				ret = btrfs_sync_file(dst_file, destoff,
						      destoff + len - 1, 0);
		}
	}

	if (ret < 0)
		return ret;

	return len;
}
|