diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 3dfee94e0618..cbeac1bebe2f 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -377,3 +377,35 @@ Description: This gives a control to limit the bio size in f2fs. Default is zero, which will follow underlying block layer limit, whereas, if it has a certain bytes value, f2fs won't submit a bio larger than that size. + +What: /sys/fs/f2fs//stat/sb_status +Date: December 2020 +Contact: "Chao Yu" +Description: Show status of f2fs superblock in real time. + + ====== ===================== ================================= + value sb status macro description + 0x1 SBI_IS_DIRTY dirty flag for checkpoint + 0x2 SBI_IS_CLOSE specify unmounting + 0x4 SBI_NEED_FSCK need fsck.f2fs to fix + 0x8 SBI_POR_DOING recovery is in progress or not + 0x10 SBI_NEED_SB_WRITE need to recover superblock + 0x20 SBI_NEED_CP need to checkpoint + 0x40 SBI_IS_SHUTDOWN shutdown by ioctl + 0x80 SBI_IS_RECOVERED recovered orphan/data + 0x100 SBI_CP_DISABLED CP was disabled last mount + 0x200 SBI_CP_DISABLED_QUICK CP was disabled quickly + 0x400 SBI_QUOTA_NEED_FLUSH need to flush quota info in CP + 0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP + 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted + 0x2000 SBI_IS_RESIZEFS resizefs is in progress + ====== ===================== ================================= + +What: /sys/fs/f2fs//ckpt_thread_ioprio +Date: January 2021 +Contact: "Daeho Jeong" +Description: Give a way to change the checkpoint merge daemon's I/O priority. + Its default value is "be,3", which means "BE" I/O class and + I/O priority "3". The class can be either "rt" or "be", + and the I/O priority must be within the valid range of that class. A "," delimiter + is required between the I/O class and the priority number. 
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index dae15c96e659..81c05baa8312 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -247,8 +247,24 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl hide up to all remaining free space. The actual space that would be unusable can be viewed at /sys/fs/f2fs//unusable This space is reclaimed once checkpoint=enable. +checkpoint_merge When checkpoint is enabled, this can be used to create a kernel + daemon and make it merge concurrent checkpoint requests as + much as possible to eliminate redundant checkpoint issues. Plus, + we can eliminate the sluggish issue caused by slow checkpoint + operation when the checkpoint is done in a process context in + a cgroup having low i/o budget and cpu shares. To make this + do better, we set the default i/o priority of the kernel daemon + to "3", to give it one higher priority than other kernel threads. + This is the same way an I/O priority is given to the jbd2 + journaling thread of the ext4 filesystem. +nocheckpoint_merge Disable checkpoint merge feature. compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", "lz4", "zstd" and "lzo-rle" algorithm. +compress_algorithm=%s:%d Control compress algorithm and its compress level; currently, only + "lz4" and "zstd" support compress level configuration. + algorithm level range + lz4 3 - 16 + zstd 1 - 22 compress_log_size=%u Support configuring compress cluster size, the size will be 4KB * (1 << %u), 16KB is minimum size, also it's default size. @@ -831,7 +847,7 @@ This is the default option. f2fs does automatic compression in the writeback of compression enabled files. 2) compress_mode=user -This disables the automaic compression and gives the user discretion of choosing the +This disables the automatic compression and gives the user discretion of choosing the target file and the timing. 
The user can do manual compression/decompression on the compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE ioctls like the below. diff --git a/fs/Kconfig b/fs/Kconfig index a6a721108d1c..452e4ff57430 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -334,6 +334,10 @@ config NFS_COMMON depends on NFSD || NFS_FS || LOCKD default y +config NFS_V4_2_SSC_HELPER + tristate + default y if NFS_V4=y || NFS_FS=y + source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" diff --git a/fs/erofs/super.c b/fs/erofs/super.c index be10b16ea66e..d5a6b9b888a5 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -158,8 +158,8 @@ static int erofs_read_superblock(struct super_block *sb) blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { - erofs_err(sb, "blksize %u isn't supported on this platform", - 1 << blkszbits); + erofs_err(sb, "blkszbits %u isn't supported on this platform", + blkszbits); goto out; } diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 58cf0cf1b818..6330bca9d91d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -48,8 +48,14 @@ static int init_inode_xattrs(struct inode *inode) int ret = 0; /* the most case is that xattrs of this inode are initialized. */ - if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); return 0; + } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) return -ERESTARTSYS; @@ -137,6 +143,8 @@ static int init_inode_xattrs(struct inode *inode) } xattr_iter_end(&it, atomic_map); + /* paired with smp_mb() at the beginning of the function. 
*/ + smp_mb(); set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); out_unlock: diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index ae325541884e..14d2de35110c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -36,8 +36,14 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) void *kaddr; struct z_erofs_map_header *h; - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); return 0; + } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; @@ -83,6 +89,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + ((h->h_clusterbits >> 5) & 7); + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); unmap_done: kunmap_atomic(kaddr); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index d13c5c6a9787..62e638a49bbf 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -76,16 +76,6 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. -config F2FS_IO_TRACE - bool "F2FS IO tracer" - depends on F2FS_FS - depends on FUNCTION_TRACER - help - F2FS IO trace is based on a function trace, which gathers process - information and block IO patterns in the filesystem level. - - If unsure, say N. - config F2FS_FAULT_INJECTION bool "F2FS fault injection facility" depends on F2FS_FS @@ -119,6 +109,16 @@ config F2FS_FS_LZ4 help Support LZ4 compress algorithm, if unsure, say Y. +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_COMPRESSION + depends on F2FS_FS_LZ4 + select LZ4HC_COMPRESS + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. 
+ config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ee7316b42f69..e5295746208b 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,6 +7,5 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1e5e9b1136ee..732ec10e7890 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return __f2fs_get_acl(inode, type, NULL); } +static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -213,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 897edb7c951a..174a0819ad96 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -13,13 +13,15 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include +#define 
DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -443,7 +445,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -1017,7 +1018,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) spin_unlock(&sbi->inode_lock[type]); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); } void f2fs_remove_dirty_inode(struct inode *inode) @@ -1707,3 +1707,174 @@ void f2fs_destroy_checkpoint_caches(void) kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int 
issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* update issue_list before we wake up issue_checkpoint thread */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + 
return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) { + struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + flush_remained_ckpt_reqs(sbi, NULL); + } +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4bcbacfe3325..77fa342de38f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -252,8 +252,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = { #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZ4_MEM_COMPRESS, GFP_NOFS); + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; @@ -272,10 +278,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + int len; + + if (level) 
+ len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + static int lz4_compress_pages(struct compress_ctx *cc) { int len; +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); if (!len) @@ -325,8 +355,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) ZSTD_CStream *stream; void *workspace; unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; - params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = ZSTD_getParams(level, cc->rlen, 0); workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), @@ -721,38 +756,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi= F2FS_I(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; int ret; int i; - dec_page_count(sbi, F2FS_RD_DATA); - - if (bio->bi_status || PageError(page)) - dic->failed = true; - - if (atomic_dec_return(&dic->pending_pages)) - return; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); - /* submit partial compressed pages */ if (dic->failed) { ret = -EIO; - goto out_free_dic; + goto out_end_io; } dic->tpages = 
page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } for (i = 0; i < dic->cluster_size; i++) { @@ -764,20 +788,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) dic->tpages[i] = f2fs_compress_alloc_page(); if (!dic->tpages[i]) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } } if (cops->init_decompress_ctx) { ret = cops->init_decompress_ctx(dic); if (ret) - goto out_free_dic; + goto out_end_io; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; - goto destroy_decompress_ctx; + goto out_destroy_decompress_ctx; } dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); @@ -816,18 +840,34 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(dic->rbuf, dic->cluster_size); -destroy_decompress_ctx: +out_destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); -out_free_dic: - if (!verity) - f2fs_decompress_end_io(dic->rpages, dic->cluster_size, - ret, false); - +out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - if (!verity) - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). 
+ */ +void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1415,7 +1455,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, - compr_blocks); + compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); @@ -1450,6 +1490,9 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, *submitted += _submitted; } + + f2fs_balance_fs(F2FS_M_SB(mapping), true); + return 0; out_err: for (++i; i < cc->cluster_size; i++) { @@ -1494,6 +1537,8 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, return err; } +static void f2fs_free_dic(struct decompress_io_ctx *dic); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; @@ -1512,12 +1557,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - atomic_set(&dic->pending_pages, cc->nr_cpages); + atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; @@ -1546,7 +1593,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) return ERR_PTR(-ENOMEM); } -void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct 
decompress_io_ctx *dic) { int i; @@ -1574,30 +1621,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity) +static void f2fs_put_dic(struct decompress_io_ctx *dic) +{ + if (refcount_dec_and_test(&dic->refcnt)) + f2fs_free_dic(dic); +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { int i; - for (i = 0; i < cluster_size; i++) { - struct page *rpage = rpages[i]; + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; if (!rpage) continue; - if (err || PageError(rpage)) - goto clear_uptodate; - - if (!verity || fsverity_verify_page(rpage)) { + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { + ClearPageUptodate(rpage); + /* will re-read again later */ + ClearPageError(rpage); + } else { SetPageUptodate(rpage); - goto unlock; } -clear_uptodate: - ClearPageUptodate(rpage); - ClearPageError(rpage); -unlock: unlock_page(rpage); } + + f2fs_put_dic(dic); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). 
+ */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic); } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d1e83f119338..fc310175eba5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,7 +25,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include #include @@ -116,10 +115,21 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_DECRYPT, - STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */ - STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */ - STEP_VERITY, +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = 1 << 0, +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = 1 << 1, +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = 1 << 2, +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif }; struct bio_post_read_ctx { @@ -129,25 +139,26 @@ struct bio_post_read_ctx { unsigned int enabled_steps; }; -static void __read_end_io(struct 
bio *bio, bool compr, bool verity) +static void f2fs_finish_read_bio(struct bio *bio) { - struct page *page; struct bio_vec *bv; struct bvec_iter_all iter_all; + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + struct page *page = bv->bv_page; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (compr && f2fs_is_compressed_page(page)) { - f2fs_decompress_pages(bio, page, verity); + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true); + f2fs_put_page_dic(page); continue; } - if (verity) - continue; -#endif - /* PG_error was set if any post_read step failed */ + /* PG_error was set if decryption or verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -158,106 +169,104 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } + + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); } -static void f2fs_release_read_bio(struct bio *bio); -static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) -{ - if (!compr) - __read_end_io(bio, false, verity); - f2fs_release_read_bio(bio); -} - -static void f2fs_decompress_bio(struct bio *bio, bool verity) -{ - __read_end_io(bio, true, verity); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx); - -static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) -{ - fscrypt_decrypt_bio(ctx->bio); -} - -static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) -{ - f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); -} - -#ifdef CONFIG_F2FS_FS_COMPRESSION -static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) -{ - f2fs_decompress_end_io(rpages, cluster_size, false, true); -} - 
-static void f2fs_verify_bio(struct bio *bio) -{ - struct bio_vec *bv; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - struct decompress_io_ctx *dic; - - dic = (struct decompress_io_ctx *)page_private(page); - - if (dic) { - if (atomic_dec_return(&dic->verity_pages)) - continue; - f2fs_verify_pages(dic->rpages, - dic->cluster_size); - f2fs_free_dic(dic); - continue; - } - - if (bio->bi_status || PageError(page)) - goto clear_uptodate; - - if (fsverity_verify_page(page)) { - SetPageUptodate(page); - goto unlock; - } -clear_uptodate: - ClearPageUptodate(page); - ClearPageError(page); -unlock: - dec_page_count(F2FS_P_SB(page), __read_io_type(page)); - unlock_page(page); - } -} -#endif - -static void f2fs_verity_work(struct work_struct *work) +static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; -#ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int enabled_steps = ctx->enabled_steps; -#endif + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readpages() again, and while verity - * will be disabled for this, decryption may still be needed, resulting - * in another bio_post_read_ctx being allocated. So to prevent - * deadlocks we need to release the current ctx to the mempool first. - * This assumes that verity is the last post-read step. + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. 
*/ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; -#ifdef CONFIG_F2FS_FS_COMPRESSION - /* previous step is decompression */ - if (enabled_steps & (1 << STEP_DECOMPRESS)) { - f2fs_verify_bio(bio); - f2fs_release_read_bio(bio); - return; - } -#endif + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). + */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; - fsverity_verify_bio(bio); - __f2fs_read_end_io(bio, false, false); + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !PageError(page) && !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); + } + + f2fs_finish_read_bio(bio); +} + +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio) +{ + struct bio_post_read_ctx *ctx = bio->bi_private; + + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); + fsverity_enqueue_verify_work(&ctx->work); + } else { + f2fs_finish_read_bio(bio); + } +} + +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. 
The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +{ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + bool all_compressed = true; + + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; + + /* PG_error was set if decryption failed. */ + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, PageError(page)); + else + all_compressed = false; + } + + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) @@ -265,74 +274,36 @@ static void f2fs_post_read_work(struct work_struct *work) struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) - f2fs_decrypt_work(ctx); + if (ctx->enabled_steps & STEP_DECRYPT) + fscrypt_decrypt_bio(ctx->bio); - if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) - f2fs_decompress_work(ctx); + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx); - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, - ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); -} - -static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, - struct work_struct *work) -{ - queue_work(sbi->post_read_wq, work); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx) -{ - /* - * We use different work queues for decryption and for verity because - * verity may require reading metadata pages that need decryption, and - * we shouldn't recurse to the same workqueue. 
- */ - - if (ctx->enabled_steps & (1 << STEP_DECRYPT) || - ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); - return; - } - - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, false, false); -} - -static bool f2fs_bio_post_read_required(struct bio *bio) -{ - return bio->bi_private; + f2fs_verify_and_finish_bio(ctx->bio); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } - if (f2fs_bio_post_read_required(bio)) { - struct bio_post_read_ctx *ctx = bio->bi_private; - - bio_post_read_processing(ctx); + if (bio->bi_status) { + f2fs_finish_read_bio(bio); return; } - __f2fs_read_end_io(bio, false, false); + if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + } else { + f2fs_verify_and_finish_bio(bio); + } } static void f2fs_write_end_io(struct bio *bio) @@ -504,7 +475,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); - if (F2FS_IO_ALIGNED(sbi)) + if (!F2FS_IO_ALIGNED(sbi)) goto submit_io; start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; @@ -712,7 +683,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); @@ -917,7 +887,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); if (bio && 
!page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -1014,7 +983,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; - f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); skip: @@ -1027,16 +995,9 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) up_write(&io->io_rwsem); } -static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) -{ - return fsverity_active(inode) && - idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); -} - static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write, - bool for_verity) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1055,13 +1016,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) - post_read_steps |= 1 << STEP_DECRYPT; - if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (for_verity && f2fs_need_verity(inode, first_idx)) - post_read_steps |= 1 << STEP_VERITY; + post_read_steps |= STEP_DECRYPT; - if (post_read_steps) { + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. 
*/ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; @@ -1073,13 +1040,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } -static void f2fs_release_read_bio(struct bio *bio) -{ - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); -} - /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr, int op_flags, bool for_write) @@ -1088,7 +1048,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write, true); + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1969,6 +1929,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (size) { + flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; @@ -2126,7 +2087,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false, true); + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2172,8 +2133,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct bio_post_read_ctx *ctx; - bool for_verity = false; int i; int ret = 0; @@ -2239,29 +2198,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - /* - * It's possible to enable fsverity on the fly when handling a cluster, - * which requires complicated error handling. Instead of adding more - * complexity, let's give a rule where end_io post-processes fsverity - * per cluster. 
In order to do that, we need to submit bio, if previous - * bio sets a different post-process policy. - */ - if (fsverity_active(cc->inode)) { - atomic_set(&dic->verity_pages, cc->nr_cpages); - for_verity = true; - - if (bio) { - ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { - __submit_bio(sbi, bio, DATA); - bio = NULL; - } - } - } - for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2277,31 +2217,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index, for_write, for_verity); + page->index, for_write); if (IS_ERR(bio)) { - unsigned int remained = dic->nr_cpages - i; - bool release = false; - ret = PTR_ERR(bio); - dic->failed = true; - - if (for_verity) { - if (!atomic_sub_return(remained, - &dic->verity_pages)) - release = true; - } else { - if (!atomic_sub_return(remained, - &dic->pending_pages)) - release = true; - } - - if (release) { - f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); - f2fs_free_dic(dic); - } - + f2fs_decompress_end_io(dic, ret); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -2313,10 +2232,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - /* tag STEP_DECOMPRESS to handle IO in wq */ ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS))) - ctx->enabled_steps |= 1 << STEP_DECOMPRESS; + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -2333,7 +2251,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, out_put_dnode: f2fs_put_dnode(&dn); out: - 
f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } *bio_ret = bio; return ret; } @@ -2342,11 +2266,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. - * - * Note that the aops->readpages() function is ONLY used for read-ahead. If - * this function ever deviates from doing just read-ahead, it should either - * use ->readpage() or do the necessary surgery to decouple ->readpages() - * from read-ahead. */ static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page) @@ -2369,7 +2288,6 @@ static int f2fs_mpage_readpages(struct inode *inode, unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; - bool drop_ra = false; map.m_pblk = 0; map.m_lblk = 0; @@ -2380,26 +2298,10 @@ static int f2fs_mpage_readpages(struct inode *inode, map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - /* - * Two readahead threads for same address range can cause race condition - * which fragments sequential read IOs. So let's avoid each other. 
- */ - if (rac && readahead_count(rac)) { - if (READ_ONCE(F2FS_I(inode)->ra_offset) == readahead_index(rac)) - drop_ra = true; - else - WRITE_ONCE(F2FS_I(inode)->ra_offset, - readahead_index(rac)); - } - for (; nr_pages; nr_pages--) { if (rac) { page = readahead_page(rac); prefetchw(&page->flags); - if (drop_ra) { - f2fs_put_page(page, 1); - continue; - } } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2462,9 +2364,6 @@ static int f2fs_mpage_readpages(struct inode *inode, } if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - - if (rac && readahead_count(rac) && !drop_ra) - WRITE_ONCE(F2FS_I(inode)->ra_offset, -1); return ret; } @@ -2748,7 +2647,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks) + int compr_blocks, + bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2886,7 +2786,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) + !F2FS_I(inode)->cp_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2933,7 +2833,7 @@ static int f2fs_write_data_page(struct page *page, #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0); + wbc, FS_DATA_IO, 0, true); } /* @@ -3101,7 +3001,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } #endif ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, 0); + &bio, &last_block, wbc, io_type, + 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) unlock_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -3877,7 +3778,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(block >= 
F2FS_I_SB(inode)->max_file_blocks)) + if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { @@ -4154,12 +4055,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!f2fs_disable_compressed_file(inode)) return -EINVAL; + f2fs_precache_extents(inode); + ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); - f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 197c914119da..91855d5721cd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5130423a13e7..f0aec024e5df 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ 
-97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -146,6 +147,7 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ int compress_mode; /* compression mode */ @@ -266,6 +268,26 @@ struct fsync_node_entry { unsigned int seq_id; /* sequence id */ }; +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -717,7 +739,6 @@ struct f2fs_inode_info { struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store 
inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ - pgoff_t ra_offset; /* ongoing readahead offset */ struct extent_tree *extent_tree; /* cached extent_tree entry */ /* avoid racing between foreground op and gc */ @@ -735,6 +756,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1310,6 +1332,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define COMPRESS_LEVEL_OFFSET 8 + /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ @@ -1337,7 +1361,7 @@ struct compress_io_ctx { atomic_t pending_pages; /* in-flight compressed page count */ }; -/* decompress io context for read IO path */ +/* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ @@ -1353,11 +1377,37 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - atomic_t pending_pages; /* in-flight compressed page count */ - atomic_t verity_pages; /* in-flight page count for verity */ - bool failed; /* indicate IO error during decompression */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. 
In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary to prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -1404,6 +1454,7 @@ struct f2fs_sb_info { wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -1444,7 +1495,6 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ @@ -1541,9 +1591,12 @@ struct f2fs_sb_info { unsigned int node_io_flag; /* For sysfs suppport */ - struct kobject s_kobj; + struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; + struct kobject s_stat_kobj; /*
/sys/fs/f2fs//stat */ + struct completion s_stat_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ @@ -3232,6 +3285,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -3418,6 +3472,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c @@ -3469,7 +3527,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks); + int compr_blocks, bool allow_balance); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3530,6 +3588,8 @@ struct f2fs_stat_info { int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; @@ -3715,8 +3775,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define 
stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) -#define stat_inc_atomic_write(inode) do { } while (0) -#define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) @@ -3876,7 +3934,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +void f2fs_end_read_compressed_page(struct page *page, bool failed); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3889,9 +3947,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_free_dic(struct decompress_io_ctx *dic); -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); +void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -3915,6 +3972,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + WARN_ON_ONCE(1); +} 
+static inline void f2fs_put_page_dic(struct page *page) +{ + WARN_ON_ONCE(1); +} static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } @@ -3934,6 +3999,11 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_flag |= + F2FS_OPTION(sbi).compress_level << + COMPRESS_LEVEL_OFFSET; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); @@ -4118,6 +4188,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return false; } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + #ifdef CONFIG_F2FS_FAULT_INJECTION extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, unsigned int type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f585545277d7..8e53f8898688 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -29,7 +29,6 @@ #include "xattr.h" #include "acl.h" #include "gc.h" -#include "trace.h" #include #include @@ -60,6 +59,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) bool need_alloc = true; int err = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -70,6 +72,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret = f2fs_is_compressed_cluster(inode, page->index); @@ -366,7 +372,6 @@ static int 
f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - f2fs_trace_ios(NULL, 1); return ret; } @@ -483,6 +488,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + switch (whence) { case SEEK_SET: case SEEK_CUR: @@ -502,7 +510,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - int err; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -510,11 +517,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - /* we don't need to use inline_data strictly */ - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; set_inode_flag(inode, FI_MMAP_FILE); @@ -667,7 +669,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - if (free_from >= sbi->max_file_blocks) + if (free_from >= max_file_blocks(inode)) goto free_partial; if (lock) @@ -767,6 +769,10 @@ int f2fs_truncate(struct inode *inode) return -EIO; } + err = dquot_initialize(inode); + if (err) + return err; + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -848,7 +854,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, 
mode); } @@ -865,6 +872,14 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -949,8 +964,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); - if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); } } @@ -2730,7 +2747,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) return -EINVAL; if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) + max_file_blocks(inode))) return -EINVAL; err = mnt_want_write_file(filp); @@ -3293,7 +3310,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = F2FS_I_SB(inode)->max_file_blocks; + end = max_file_blocks(inode); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; @@ -4043,8 +4060,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) - ret = -ENOENT; + if (!page) { + ret = -ENOMEM; + break; + } set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); @@ -4349,6 +4368,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } + if (unlikely(IS_IMMUTABLE(inode))) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { 
bool preallocated = false; @@ -4413,6 +4437,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } +unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3ef84e6ded41..39330ad3c44e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1169,8 +1169,6 @@ static int move_data_block(struct inode *inode, block_t bidx, if (err) goto put_out; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; @@ -1207,6 +1205,9 @@ static int move_data_block(struct inode *inode, block_t bidx, } } + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); @@ -1233,9 +1234,6 @@ static int move_data_block(struct inode *inode, block_t bidx, set_page_writeback(fio.encrypted_page); ClearPageError(page); - /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 0a8f64feefe4..e8281a01449d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -210,6 +210,10 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; + err = dquot_initialize(inode); + if (err) + return err; + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6edb1ab579a1..887804968576 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -855,7 +855,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; + 
spin_unlock(&inode->i_lock); + *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -1041,7 +1045,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + iput(whiteout); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3a24423ac65f..a8a0fb890e8d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,7 +17,6 @@ #include "node.h" #include "segment.h" #include "xattr.h" -#include "trace.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -2089,7 +2088,6 @@ static int f2fs_set_node_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -2696,7 +2694,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) src = F2FS_INODE(page); dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index deca74cb17df..440634dfaa56 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,7 +20,6 @@ #include "segment.h" #include "node.h" #include "gc.h" -#include "trace.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_trace_pid(page); - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -610,8 +607,6 @@ static int issue_flush_thread(void *data) if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, 
*next; int ret; @@ -632,8 +627,6 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = NULL; } - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e81eb0748e2a..229814b4f4a6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,11 +101,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define GET_SEC_FROM_SEG(sbi, segno) \ - ((segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - ((secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 972736d71fa4..30d5abef4361 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,13 +25,14 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" #include "gc.h" -#include "trace.h" #define CREATE_TRACE_POINTS #include @@ -143,6 +144,8 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, @@ -213,6 +216,8 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, 
"compress_extension=%s"}, @@ -464,6 +469,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return 0; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +#endif + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + +#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. 
:"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!level || level > ZSTD_maxCLevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -872,6 +945,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { @@ -882,17 +961,45 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; - } else if (!strcmp(name, "lz4")) { +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif + } else if (!strncmp(name, "lz4", 3)) { +#ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; - } else if (!strcmp(name, "zstd")) { +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif + } else if (!strncmp(name, "zstd", 4)) { +#ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif } else if (!strcmp(name, "lzo-rle")) { +#ifdef CONFIG_F2FS_FS_LZORLE + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZORLE; 
+#else + f2fs_info(sbi, "kernel doesn't support lzorle compression"); +#endif } else { kfree(name); return -EINVAL; @@ -1076,8 +1183,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - fi->ra_offset = -1; - return &fi->vfs_inode; } @@ -1245,6 +1350,12 @@ static void f2fs_put_super(struct super_block *sb) /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); + /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do @@ -1343,16 +1454,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) { - struct cp_control cpc; - - cpc.reason = __get_cp_reason(sbi); - - down_write(&sbi->gc_lock); - err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); - } - f2fs_trace_ios(NULL, 1); + if (sync) + err = f2fs_issue_checkpoint(sbi); return err; } @@ -1369,6 +1472,10 @@ static int f2fs_freeze(struct super_block *sb) /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; + + /* ensure no checkpoint required */ + if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) + return -EINVAL; return 0; } @@ -1539,6 +1646,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, } seq_printf(seq, ",compress_algorithm=%s", algtype); + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); @@ -1674,6 +1784,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, 
",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -1957,6 +2071,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; + } + } else { + f2fs_stop_ckpt_thread(sbi); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -2641,10 +2768,10 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_blocks(void) +loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; - loff_t leaf_count = DEF_ADDRS_PER_BLOCK; + loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - @@ -2653,6 +2780,11 @@ static loff_t max_file_blocks(void) * result as zero. 
*/ + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + /* two direct node blocks */ result += (leaf_count * 2); @@ -3536,8 +3668,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_options; - sbi->max_file_blocks = max_file_blocks(); - sb->s_maxbytes = sbi->max_file_blocks << + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; @@ -3704,6 +3835,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_init_fsync_node_info(sbi); + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -3789,12 +3933,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * previous checkpoint was not done by clean system shutdown. 
*/ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - err = -EROFS; + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - goto free_meta; - } - f2fs_info(sbi, "write access unavailable, skipping recovery"); + else + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } @@ -3913,6 +4055,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_sm: f2fs_destroy_segment_manager(sbi); f2fs_destroy_post_read_wq(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -4027,8 +4171,6 @@ static int __init init_f2fs_fs(void) return -EINVAL; } - f2fs_build_trace_ios(); - err = init_inodecache(); if (err) goto fail; @@ -4121,7 +4263,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); - f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 30bae57428d1..e38a7f6921dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -34,6 +35,7 @@ enum { FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ }; struct f2fs_attr { @@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; return NULL; } @@ -96,6 +100,12 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, sbi->sectors_written_start) >> 1))); } +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + 
static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -255,6 +265,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -308,6 +335,38 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return ret ? ret : count; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -567,6 +626,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); 
F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); @@ -652,6 +712,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(unusable), @@ -702,6 +763,13 @@ static struct attribute *f2fs_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_feat); +F2FS_GENERAL_RO_ATTR(sb_status); +static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_stat); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -730,6 +798,44 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_groups = f2fs_stat_groups, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -936,11 +1042,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) { - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - return err; - } + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -956,6 +1066,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) @@ -967,6 +1084,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + + kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); diff 
--git a/fs/f2fs/trace.c b/fs/f2fs/trace.c deleted file mode 100644 index d0ab533a9ce8..000000000000 --- a/fs/f2fs/trace.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "trace.h" - -static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; -static struct last_io_info last_io; - -static inline void __print_last_io(void) -{ - if (!last_io.len) - return; - - trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", - last_io.major, last_io.minor, - last_io.pid, "----------------", - last_io.type, - last_io.fio.op, last_io.fio.op_flags, - last_io.fio.new_blkaddr, - last_io.len); - memset(&last_io, 0, sizeof(last_io)); -} - -static int __file_type(struct inode *inode, pid_t pid) -{ - if (f2fs_is_atomic_file(inode)) - return __ATOMIC_FILE; - else if (f2fs_is_volatile_file(inode)) - return __VOLATILE_FILE; - else if (S_ISDIR(inode->i_mode)) - return __DIR_FILE; - else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) - return __NODE_FILE; - else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) - return __META_FILE; - else if (pid) - return __NORMAL_FILE; - else - return __MISC_FILE; -} - -void f2fs_trace_pid(struct page *page) -{ - struct inode *inode = page->mapping->host; - pid_t pid = task_pid_nr(current); - void *p; - - set_page_private(page, (unsigned long)pid); - -retry: - if (radix_tree_preload(GFP_NOFS)) - return; - - spin_lock(&pids_lock); - p = radix_tree_lookup(&pids, pid); - if (p == current) - goto out; - if (p) - radix_tree_delete(&pids, pid); - - if (radix_tree_insert(&pids, pid, current)) { - spin_unlock(&pids_lock); - radix_tree_preload_end(); - cond_resched(); - goto retry; - } - - trace_printk("%3x:%3x %4x %-16s\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), - pid, current->comm); -out: - spin_unlock(&pids_lock); - 
radix_tree_preload_end(); -} - -void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) -{ - struct inode *inode; - pid_t pid; - int major, minor; - - if (flush) { - __print_last_io(); - return; - } - - inode = fio->page->mapping->host; - pid = page_private(fio->page); - - major = MAJOR(inode->i_sb->s_dev); - minor = MINOR(inode->i_sb->s_dev); - - if (last_io.major == major && last_io.minor == minor && - last_io.pid == pid && - last_io.type == __file_type(inode, pid) && - last_io.fio.op == fio->op && - last_io.fio.op_flags == fio->op_flags && - last_io.fio.new_blkaddr + last_io.len == - fio->new_blkaddr) { - last_io.len++; - return; - } - - __print_last_io(); - - last_io.major = major; - last_io.minor = minor; - last_io.pid = pid; - last_io.type = __file_type(inode, pid); - last_io.fio = *fio; - last_io.len = 1; - return; -} - -void f2fs_build_trace_ios(void) -{ - spin_lock_init(&pids_lock); -} - -#define PIDVEC_SIZE 128 -static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, - unsigned int max_items) -{ - struct radix_tree_iter iter; - void **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, &pids, &iter, first_index) { - results[ret] = iter.index; - if (++ret == max_items) - break; - } - return ret; -} - -void f2fs_destroy_trace_ios(void) -{ - pid_t pid[PIDVEC_SIZE]; - pid_t next_pid = 0; - unsigned int found; - - spin_lock(&pids_lock); - while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { - unsigned idx; - - next_pid = pid[found - 1] + 1; - for (idx = 0; idx < found; idx++) - radix_tree_delete(&pids, pid[idx]); - } - spin_unlock(&pids_lock); -} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h deleted file mode 100644 index 789f6aa727fc..000000000000 --- a/fs/f2fs/trace.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#ifndef 
__F2FS_TRACE_H__ -#define __F2FS_TRACE_H__ - -#ifdef CONFIG_F2FS_IO_TRACE -#include - -enum file_type { - __NORMAL_FILE, - __DIR_FILE, - __NODE_FILE, - __META_FILE, - __ATOMIC_FILE, - __VOLATILE_FILE, - __MISC_FILE, -}; - -struct last_io_info { - int major, minor; - pid_t pid; - enum file_type type; - struct f2fs_io_info fio; - block_t len; -}; - -extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct f2fs_io_info *, int); -extern void f2fs_build_trace_ios(void); -extern void f2fs_destroy_trace_ios(void); -#else -#define f2fs_trace_pid(p) -#define f2fs_trace_ios(i, n) -#define f2fs_build_trace_ios() -#define f2fs_destroy_trace_ios() - -#endif -#endif /* __F2FS_TRACE_H__ */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index eb892dbe85e3..d0313592a8ae 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -327,7 +327,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - int err = 0; + int err; if (!xnid && !inline_size) return -ENODATA; @@ -515,7 +515,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry = NULL; - int error = 0; + int error; unsigned int size, len; void *base_addr = NULL; int base_size; @@ -562,7 +562,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; - int error = 0; + int error; size_t rest = buffer_size; down_read(&F2FS_I(inode)->i_xattr_sem); @@ -632,7 +632,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int found, newsize; size_t len; __u32 new_hsize; - int error = 0; + int error; if (name == NULL) return -EINVAL; @@ -673,7 +673,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, } if (value && 
f2fs_xattr_value_same(here, value, size)) - goto exit; + goto same; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; @@ -738,17 +738,20 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); - clear_inode_flag(inode, FI_ACL_MODE); - } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + exit: kfree(base_addr); return error; diff --git a/fs/libfs.c b/fs/libfs.c index d1c3bade9f30..79721571e014 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1388,8 +1388,8 @@ static bool needs_casefold(const struct inode *dir) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); @@ -1426,7 +1426,6 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, return 1; return !!memcmp(str, name->name, len); } -EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1435,7 +1434,7 @@ EXPORT_SYMBOL(generic_ci_d_compare); * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +static int generic_ci_d_hash(const 
struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1450,7 +1449,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } -EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index fa41dda39925..4c10fb5138f1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -512,6 +512,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "NULL", }, [NLMPROC_TEST] = { .pc_func = nlm4svc_proc_test, @@ -520,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, + .pc_name = "TEST", }, [NLMPROC_LOCK] = { .pc_func = nlm4svc_proc_lock, @@ -528,6 +530,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "LOCK", }, [NLMPROC_CANCEL] = { .pc_func = nlm4svc_proc_cancel, @@ -536,6 +539,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "CANCEL", }, [NLMPROC_UNLOCK] = { .pc_func = nlm4svc_proc_unlock, @@ -544,6 +548,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "UNLOCK", }, [NLMPROC_GRANTED] = { .pc_func = nlm4svc_proc_granted, @@ -552,6 +557,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "GRANTED", }, 
[NLMPROC_TEST_MSG] = { .pc_func = nlm4svc_proc_test_msg, @@ -560,6 +566,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_MSG", }, [NLMPROC_LOCK_MSG] = { .pc_func = nlm4svc_proc_lock_msg, @@ -568,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_MSG", }, [NLMPROC_CANCEL_MSG] = { .pc_func = nlm4svc_proc_cancel_msg, @@ -576,6 +584,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_MSG", }, [NLMPROC_UNLOCK_MSG] = { .pc_func = nlm4svc_proc_unlock_msg, @@ -584,6 +593,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_MSG", }, [NLMPROC_GRANTED_MSG] = { .pc_func = nlm4svc_proc_granted_msg, @@ -592,6 +602,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_MSG", }, [NLMPROC_TEST_RES] = { .pc_func = nlm4svc_proc_null, @@ -600,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_RES", }, [NLMPROC_LOCK_RES] = { .pc_func = nlm4svc_proc_null, @@ -608,6 +620,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_RES", }, [NLMPROC_CANCEL_RES] = { .pc_func = nlm4svc_proc_null, @@ -616,6 +629,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = 
sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_RES", }, [NLMPROC_UNLOCK_RES] = { .pc_func = nlm4svc_proc_null, @@ -624,6 +638,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_RES", }, [NLMPROC_GRANTED_RES] = { .pc_func = nlm4svc_proc_granted_res, @@ -632,6 +647,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_RES", }, [NLMPROC_NSM_NOTIFY] = { .pc_func = nlm4svc_proc_sm_notify, @@ -640,6 +656,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "SM_NOTIFY", }, [17] = { .pc_func = nlm4svc_proc_unused, @@ -648,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [18] = { .pc_func = nlm4svc_proc_unused, @@ -656,6 +674,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [19] = { .pc_func = nlm4svc_proc_unused, @@ -664,6 +683,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [NLMPROC_SHARE] = { .pc_func = nlm4svc_proc_share, @@ -672,6 +692,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "SHARE", }, [NLMPROC_UNSHARE] = { .pc_func = nlm4svc_proc_unshare, @@ -680,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct 
nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "UNSHARE", }, [NLMPROC_NM_LOCK] = { .pc_func = nlm4svc_proc_nm_lock, @@ -688,6 +710,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "NM_LOCK", }, [NLMPROC_FREE_ALL] = { .pc_func = nlm4svc_proc_free_all, @@ -696,5 +719,6 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "FREE_ALL", }, }; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 50855f2c1f4b..4ae4b63b5392 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -554,6 +554,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "NULL", }, [NLMPROC_TEST] = { .pc_func = nlmsvc_proc_test, @@ -562,6 +563,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, + .pc_name = "TEST", }, [NLMPROC_LOCK] = { .pc_func = nlmsvc_proc_lock, @@ -570,6 +572,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "LOCK", }, [NLMPROC_CANCEL] = { .pc_func = nlmsvc_proc_cancel, @@ -578,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "CANCEL", }, [NLMPROC_UNLOCK] = { .pc_func = nlmsvc_proc_unlock, @@ -586,6 +590,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "UNLOCK", }, [NLMPROC_GRANTED] = { .pc_func = 
nlmsvc_proc_granted, @@ -594,6 +599,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "GRANTED", }, [NLMPROC_TEST_MSG] = { .pc_func = nlmsvc_proc_test_msg, @@ -602,6 +608,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_MSG", }, [NLMPROC_LOCK_MSG] = { .pc_func = nlmsvc_proc_lock_msg, @@ -610,6 +617,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_MSG", }, [NLMPROC_CANCEL_MSG] = { .pc_func = nlmsvc_proc_cancel_msg, @@ -618,6 +626,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_MSG", }, [NLMPROC_UNLOCK_MSG] = { .pc_func = nlmsvc_proc_unlock_msg, @@ -626,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_MSG", }, [NLMPROC_GRANTED_MSG] = { .pc_func = nlmsvc_proc_granted_msg, @@ -634,6 +644,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_MSG", }, [NLMPROC_TEST_RES] = { .pc_func = nlmsvc_proc_null, @@ -642,6 +653,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_RES", }, [NLMPROC_LOCK_RES] = { .pc_func = nlmsvc_proc_null, @@ -650,6 +662,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + 
.pc_name = "LOCK_RES", }, [NLMPROC_CANCEL_RES] = { .pc_func = nlmsvc_proc_null, @@ -658,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_RES", }, [NLMPROC_UNLOCK_RES] = { .pc_func = nlmsvc_proc_null, @@ -666,6 +680,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_RES", }, [NLMPROC_GRANTED_RES] = { .pc_func = nlmsvc_proc_granted_res, @@ -674,6 +689,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_RES", }, [NLMPROC_NSM_NOTIFY] = { .pc_func = nlmsvc_proc_sm_notify, @@ -682,6 +698,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "SM_NOTIFY", }, [17] = { .pc_func = nlmsvc_proc_unused, @@ -690,6 +707,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [18] = { .pc_func = nlmsvc_proc_unused, @@ -698,6 +716,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [19] = { .pc_func = nlmsvc_proc_unused, @@ -706,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [NLMPROC_SHARE] = { .pc_func = nlmsvc_proc_share, @@ -714,6 +734,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = 
Ck+St+1, + .pc_name = "SHARE", }, [NLMPROC_UNSHARE] = { .pc_func = nlmsvc_proc_unshare, @@ -722,6 +743,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "UNSHARE", }, [NLMPROC_NM_LOCK] = { .pc_func = nlmsvc_proc_nm_lock, @@ -730,6 +752,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "NM_LOCK", }, [NLMPROC_FREE_ALL] = { .pc_func = nlmsvc_proc_free_all, @@ -738,5 +761,6 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "FREE_ALL", }, }; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 79ff172eb1c8..c5348ba81129 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1060,6 +1060,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_decode = nfs4_decode_void, .pc_encode = nfs4_encode_void, .pc_xdrressize = 1, + .pc_name = "NULL", }, [CB_COMPOUND] = { .pc_func = nfs4_callback_compound, @@ -1067,6 +1068,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_argsize = 256, .pc_ressize = 256, .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + .pc_name = "COMPOUND", } }; diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 7412bb164fa7..f2b34cfe286c 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -167,10 +167,28 @@ nfs_get_parent(struct dentry *dentry) return parent; } +static u64 nfs_fetch_iversion(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + + /* Is this the right call?: */ + nfs_revalidate_inode(server, inode); + /* + * Also, note we're ignoring any returned error. That seems to be + * the practice for cache consistency information elsewhere in + * the server, but I'm not sure why. 
+ */ + if (server->nfs_client->rpc_ops->version >= 4) + return inode_peek_iversion_raw(inode); + else + return time_to_chattr(&inode->i_ctime); +} + const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, + .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 57b3821d975a..441a2fa073c8 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -420,7 +420,9 @@ static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = { */ void nfs42_ssc_register_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl); +#endif } /** @@ -431,7 +433,9 @@ void nfs42_ssc_register_ops(void) */ void nfs42_ssc_unregister_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl); +#endif } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4034102010f0..c7a924580eec 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -86,9 +86,11 @@ const struct super_operations nfs_sops = { }; EXPORT_SYMBOL_GPL(nfs_sops); +#ifdef CONFIG_NFS_V4_2 static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = { .sco_sb_deactive = nfs_sb_deactive, }; +#endif #if IS_ENABLED(CONFIG_NFS_V4) static int __init register_nfs4_fs(void) @@ -111,15 +113,21 @@ static void unregister_nfs4_fs(void) } #endif +#ifdef CONFIG_NFS_V4_2 static void nfs_ssc_register_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs_ssc_register(&nfs_ssc_clnt_ops_tbl); +#endif } static void nfs_ssc_unregister_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl); +#endif } +#endif /* CONFIG_NFS_V4_2 */ static struct shrinker acl_shrinker = { .count_objects = nfs_access_cache_count, @@ -148,7 +156,9 @@ int __init register_nfs_fs(void) ret = register_shrinker(&acl_shrinker); if (ret < 0) goto error_3; +#ifdef 
CONFIG_NFS_V4_2 nfs_ssc_register_ops(); +#endif return 0; error_3: nfs_unregister_sysctl(); @@ -168,7 +178,9 @@ void __exit unregister_nfs_fs(void) unregister_shrinker(&acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); +#ifdef CONFIG_NFS_V4_2 nfs_ssc_unregister_ops(); +#endif unregister_filesystem(&nfs_fs_type); } diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile index fa82f5aaa6d9..119c75ab9fd0 100644 --- a/fs/nfs_common/Makefile +++ b/fs/nfs_common/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o nfs_acl-objs := nfsacl.o obj-$(CONFIG_GRACE_PERIOD) += grace.o -obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o +obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c index f43bbb373913..7c1509e968c8 100644 --- a/fs/nfs_common/nfs_ssc.c +++ b/fs/nfs_common/nfs_ssc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * fs/nfs_common/nfs_ssc_comm.c - * * Helper for knfsd's SSC to access ops in NFS client modules * * Author: Dai Ngo diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index d056ad2fdefd..79c563c1a5e8 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -295,3 +295,55 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, nfsacl_desc.desc.array_len; } EXPORT_SYMBOL_GPL(nfsacl_decode); + +/** + * nfs_stream_decode_acl - Decode an NFSv3 ACL + * + * @xdr: an xdr_stream positioned at an encoded ACL + * @aclcnt: OUT: count of ACEs in decoded posix_acl + * @pacl: OUT: a dynamically-allocated buffer containing the decoded posix_acl + * + * Return values: + * %false: The encoded ACL is not valid + * %true: @pacl contains a decoded ACL, and @xdr is advanced + * + * On a successful return, caller must release *pacl using posix_acl_release(). 
+ */ +bool nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt, + struct posix_acl **pacl) +{ + const size_t elem_size = XDR_UNIT * 3; + struct nfsacl_decode_desc nfsacl_desc = { + .desc = { + .elem_size = elem_size, + .xcode = pacl ? xdr_nfsace_decode : NULL, + }, + }; + unsigned int base; + u32 entries; + + if (xdr_stream_decode_u32(xdr, &entries) < 0) + return false; + if (entries > NFS_ACL_MAX_ENTRIES) + return false; + + base = xdr_stream_pos(xdr); + if (!xdr_inline_decode(xdr, XDR_UNIT + elem_size * entries)) + return false; + nfsacl_desc.desc.array_maxlen = entries; + if (xdr_decode_array2(xdr->buf, base, &nfsacl_desc.desc)) + return false; + + if (pacl) { + if (entries != nfsacl_desc.desc.array_len || + posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) { + posix_acl_release(nfsacl_desc.acl); + return false; + } + *pacl = nfsacl_desc.acl; + } + if (aclcnt) + *aclcnt = entries; + return true; +} +EXPORT_SYMBOL_GPL(nfs_stream_decode_acl); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index dbbc583d6273..821e5913faee 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -76,6 +76,7 @@ config NFSD_V4 select CRYPTO_MD5 select CRYPTO_SHA256 select GRACE_PERIOD + select NFS_V4_2_SSC_HELPER if NFS_V4_2 help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). 
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 81e7bb12aca6..7c863f2c21e0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -331,12 +331,29 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) fsloc->locations = NULL; } +static int export_stats_init(struct export_stats *stats) +{ + stats->start_time = ktime_get_seconds(); + return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM); +} + +static void export_stats_reset(struct export_stats *stats) +{ + nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); +} + +static void export_stats_destroy(struct export_stats *stats) +{ + nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); +} + static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); + export_stats_destroy(&exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } @@ -692,22 +709,47 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); +static int is_export_stats_file(struct seq_file *m) +{ + /* + * The export_stats file uses the same ops as the exports file. + * We use the file's name to determine the reported info per export. + * There is no rename in nfsdfs, so d_name.name is stable. 
+ */ + return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); +} + static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { - struct svc_export *exp ; + struct svc_export *exp; + bool export_stats = is_export_stats_file(m); - if (h ==NULL) { - seq_puts(m, "#path domain(flags)\n"); + if (h == NULL) { + if (export_stats) + seq_puts(m, "#path domain start-time\n#\tstats\n"); + else + seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); + if (export_stats) { + seq_printf(m, "\t%lld\n", exp->ex_stats.start_time); + seq_printf(m, "\tfh_stale: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE])); + seq_printf(m, "\tio_read: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ])); + seq_printf(m, "\tio_write: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE])); + seq_putc(m, '\n'); + return 0; + } seq_putc(m, '('); - if (test_bit(CACHE_VALID, &h->flags) && + if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); @@ -748,6 +790,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; + export_stats_reset(&new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -780,10 +823,15 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) static struct cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); - if (i) - return &i->h; - else + if (!i) return NULL; + + if (export_stats_init(&i->ex_stats)) { + kfree(i); + return NULL; + } + + return &i->h; } static const struct 
cache_detail svc_export_cache_template = { @@ -1245,10 +1293,14 @@ static int e_show(struct seq_file *m, void *p) struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; + bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); - seq_puts(m, "# Path Client(Flags) # IPs\n"); + if (export_stats) + seq_puts(m, "# Path Client Start-time\n#\tStats\n"); + else + seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index e7daa1f246f0..ee0e3aba4a6e 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -6,6 +6,7 @@ #define NFSD_EXPORT_H #include +#include #include #include @@ -46,6 +47,19 @@ struct exp_flavor_info { u32 flags; }; +/* Per-export stats */ +enum { + EXP_STATS_FH_STALE, + EXP_STATS_IO_READ, + EXP_STATS_IO_WRITE, + EXP_STATS_COUNTERS_NUM +}; + +struct export_stats { + time64_t start_time; + struct percpu_counter counter[EXP_STATS_COUNTERS_NUM]; +}; + struct svc_export { struct cache_head h; struct auth_domain * ex_client; @@ -62,6 +76,7 @@ struct svc_export { struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; struct rcu_head ex_rcu; + struct export_stats ex_stats; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 7346acda9d76..c330f5bd0cf3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -10,6 +10,7 @@ #include #include +#include /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -21,6 +22,14 @@ struct cld_net; struct nfsd4_client_tracking_ops; +enum { + /* cache misses due only to checksum comparison failures */ + NFSD_NET_PAYLOAD_MISSES, + /* amount of memory (in bytes) currently consumed by the DRC */ + NFSD_NET_DRC_MEM_USAGE, + NFSD_NET_COUNTERS_NUM +}; + /* * Represents a nfsd "container". 
With respect to nfsv4 state tracking, the * fields of interest are the *_id_hashtbls and the *_name_tree. These track @@ -149,20 +158,16 @@ struct nfsd_net { /* * Stats and other tracking of on the duplicate reply cache. - * These fields and the "rc" fields in nfsdstats are modified - * with only the per-bucket cache lock, which isn't really safe - * and should be fixed if we want the statistics to be - * completely accurate. + * The longest_chain* fields are modified with only the per-bucket + * cache lock, which isn't really safe and should be fixed if we want + * these statistics to be completely accurate. */ /* total number of entries */ atomic_t num_drc_entries; - /* cache misses due only to checksum comparison failures */ - unsigned int payload_misses; - - /* amount of memory (in bytes) currently consumed by the DRC */ - unsigned int drc_mem_usage; + /* Per-netns stats counters */ + struct percpu_counter counter[NFSD_NET_COUNTERS_NUM]; /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index b0f66604532a..7eeac5b81c20 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -188,63 +188,49 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp) static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_getaclargs *argp = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &argp->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) return 0; - argp->mask = ntohl(*p); p++; - return xdr_argsize_check(rqstp, p); + return 1; } - static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_setaclargs *argp = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, 
&argp->fh)) return 0; - argp->mask = ntohl(*p++); - if (argp->mask & ~NFS_ACL_MASK || - !xdr_argsize_check(rqstp, p)) + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return 0; + if (argp->mask & ~NFS_ACL_MASK) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL)) return 0; - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (argp->mask & NFS_ACL) ? - &argp->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (argp->mask & NFS_DFACL) ? - &argp->acl_default : NULL); - return (n > 0); -} - -static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_fhandle *argp = rqstp->rq_argp; - - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return 1; } static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) { - struct nfsd3_accessargs *argp = rqstp->rq_argp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nfsd3_accessargs *args = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) return 0; - argp->access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } /* @@ -371,6 +357,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [ACLPROC2_GETACL] = { .pc_func = nfsacld_proc_getacl, @@ -381,6 +368,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", }, [ACLPROC2_SETACL] = { .pc_func = nfsacld_proc_setacl, @@ -391,16 +379,18 @@ 
static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "SETACL", }, [ACLPROC2_GETATTR] = { .pc_func = nfsacld_proc_getattr, - .pc_decode = nfsaclsvc_decode_fhandleargs, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfsaclsvc_encode_attrstatres, .pc_release = nfsaclsvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [ACLPROC2_ACCESS] = { .pc_func = nfsacld_proc_access, @@ -411,6 +401,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1, + .pc_name = "SETATTR", }, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 7c30876a31a1..a568b842e9eb 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -124,43 +124,39 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) /* * XDR decode functions */ + static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_getaclargs *args = rqstp->rq_argp; - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->mask) < 0) return 0; - args->mask = ntohl(*p); p++; - return xdr_argsize_check(rqstp, p); + return 1; } - static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { - struct nfsd3_setaclargs *args = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nfsd3_setaclargs *argp = rqstp->rq_argp; - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh)) return 0; - args->mask = ntohl(*p++); - if (args->mask & ~NFS_ACL_MASK || - 
!xdr_argsize_check(rqstp, p)) + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return 0; + if (argp->mask & ~NFS_ACL_MASK) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL)) return 0; - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (args->mask & NFS_ACL) ? - &args->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (args->mask & NFS_DFACL) ? - &args->acl_default : NULL); - return (n > 0); + return 1; } /* @@ -251,6 +247,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [ACLPROC3_GETACL] = { .pc_func = nfsd3_proc_getacl, @@ -261,6 +258,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", }, [ACLPROC3_SETACL] = { .pc_func = nfsd3_proc_setacl, @@ -271,6 +269,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd3_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT, + .pc_name = "SETACL", }, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 76931f4f57c3..8675851199f8 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -124,15 +124,16 @@ nfsd3_proc_access(struct svc_rqst *rqstp) static __be32 nfsd3_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd3_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_readlinkres *resp = rqstp->rq_resp; + char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. 
*/ fh_copy(&resp->fh, &argp->fh); resp->len = NFS3_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + resp->status = nfsd_readlink(rqstp, &resp->fh, buffer, &resp->len); return rpc_success; } @@ -144,25 +145,38 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - u32 max_blocksize = svc_max_payload(rqstp); - unsigned long cnt = min(argp->count, max_blocksize); + u32 max_blocksize = svc_max_payload(rqstp); + unsigned int len; + int v; + + argp->count = min_t(u32, argp->count, max_blocksize); dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, (unsigned long long) argp->offset); + v = 0; + len = argp->count; + while (len > 0) { + struct page *page = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(page); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - resp->count = cnt; + resp->count = argp->count; svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, argp->vlen, &resp->count, - &resp->eof); + rqstp->rq_vec, v, &resp->count, &resp->eof); return rpc_success; } @@ -421,6 +435,23 @@ nfsd3_proc_link(struct svc_rqst *rqstp) return rpc_success; } +static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd3_readdirres *resp, + int count) +{ + count = min_t(u32, count, svc_max_payload(rqstp)); + + /* Convert byte count to number of words (i.e. 
>> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->buflen = (count >> 2) - 2; + + resp->buffer = page_address(*rqstp->rq_next_page); + while (count > 0) { + rqstp->rq_next_page++; + count -= PAGE_SIZE; + } +} + /* * Read a portion of a directory. */ @@ -430,6 +461,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; int count = 0; + loff_t offset; struct page **p; caddr_t page_addr = NULL; @@ -437,18 +469,16 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, (u32) argp->cookie); - /* Make sure we've room for the NULL ptr & eof flag, and shrink to - * client read size */ - count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->buflen = count; resp->common.err = nfs_ok; - resp->buffer = argp->buffer; resp->rqstp = rqstp; - resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie, + offset = argp->cookie; + + resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); count = 0; @@ -464,8 +494,6 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) } resp->count = count >> 2; if (resp->offset) { - loff_t offset = argp->cookie; - if (unlikely(resp->offset1)) { /* we ended up with offset on a page boundary */ *resp->offset = htonl(offset >> 32); @@ -498,16 +526,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, (u32) argp->cookie); - /* Convert byte count to number of words (i.e. 
>> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); resp->common.err = nfs_ok; - resp->buffer = argp->buffer; - resp->buflen = resp->count; resp->rqstp = rqstp; offset = argp->cookie; @@ -683,7 +707,6 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) * NFSv3 Server procedures. * Only the results of non-idempotent operations are cached. */ -#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle #define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat #define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat #define nfsd3_mkdirargs nfsd3_createargs @@ -708,16 +731,18 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [NFS3PROC_GETATTR] = { .pc_func = nfsd3_proc_getattr, .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_attrstatres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFS3PROC_SETATTR] = { .pc_func = nfsd3_proc_setattr, @@ -728,6 +753,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "SETATTR", }, [NFS3PROC_LOOKUP] = { .pc_func = nfsd3_proc_lookup, @@ -738,6 +764,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+pAT+pAT, + .pc_name = "LOOKUP", }, [NFS3PROC_ACCESS] = { .pc_func = nfsd3_proc_access, @@ -748,16 +775,18 @@ static const struct svc_procedure nfsd_procedures3[22] = { 
.pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1, + .pc_name = "ACCESS", }, [NFS3PROC_READLINK] = { .pc_func = nfsd3_proc_readlink, - .pc_decode = nfs3svc_decode_readlinkargs, + .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_readlinkres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFS3PROC_READ] = { .pc_func = nfsd3_proc_read, @@ -768,6 +797,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + .pc_name = "READ", }, [NFS3PROC_WRITE] = { .pc_func = nfsd3_proc_write, @@ -778,6 +808,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_writeres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+4, + .pc_name = "WRITE", }, [NFS3PROC_CREATE] = { .pc_func = nfsd3_proc_create, @@ -788,6 +819,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "CREATE", }, [NFS3PROC_MKDIR] = { .pc_func = nfsd3_proc_mkdir, @@ -798,6 +830,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKDIR", }, [NFS3PROC_SYMLINK] = { .pc_func = nfsd3_proc_symlink, @@ -808,6 +841,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "SYMLINK", }, [NFS3PROC_MKNOD] = { .pc_func = nfsd3_proc_mknod, @@ -818,6 +852,7 @@ static 
const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKNOD", }, [NFS3PROC_REMOVE] = { .pc_func = nfsd3_proc_remove, @@ -828,6 +863,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "REMOVE", }, [NFS3PROC_RMDIR] = { .pc_func = nfsd3_proc_rmdir, @@ -838,6 +874,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "RMDIR", }, [NFS3PROC_RENAME] = { .pc_func = nfsd3_proc_rename, @@ -848,6 +885,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_renameres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+WC, + .pc_name = "RENAME", }, [NFS3PROC_LINK] = { .pc_func = nfsd3_proc_link, @@ -858,6 +896,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_linkres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+pAT+WC, + .pc_name = "LINK", }, [NFS3PROC_READDIR] = { .pc_func = nfsd3_proc_readdir, @@ -867,6 +906,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_argsize = sizeof(struct nfsd3_readdirargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFS3PROC_READDIRPLUS] = { .pc_func = nfsd3_proc_readdirplus, @@ -876,6 +916,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_argsize = sizeof(struct nfsd3_readdirplusargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIRPLUS", }, [NFS3PROC_FSSTAT] = { .pc_func = nfsd3_proc_fsstat, @@ -885,6 +926,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_fsstatres), .pc_cachetype = RC_NOCACHE, 
.pc_xdrressize = ST+pAT+2*6+1, + .pc_name = "FSSTAT", }, [NFS3PROC_FSINFO] = { .pc_func = nfsd3_proc_fsinfo, @@ -894,6 +936,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_fsinfores), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+12, + .pc_name = "FSINFO", }, [NFS3PROC_PATHCONF] = { .pc_func = nfsd3_proc_pathconf, @@ -903,6 +946,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_pathconfres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+6, + .pc_name = "PATHCONF", }, [NFS3PROC_COMMIT] = { .pc_func = nfsd3_proc_commit, @@ -913,6 +957,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_commitres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+WC+2, + .pc_name = "COMMIT", }, }; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 34b880211e5e..9d9a01ce0b27 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -29,8 +29,9 @@ static u32 nfs3_ftypes[] = { /* - * XDR functions for basic NFS types + * Basic NFSv3 data types (RFC 1813 Sections 2.5 and 2.6) */ + static __be32 * encode_time3(__be32 *p, struct timespec64 *time) { @@ -38,32 +39,47 @@ encode_time3(__be32 *p, struct timespec64 *time) return p; } -static __be32 * -decode_time3(__be32 *p, struct timespec64 *time) +static bool +svcxdr_decode_nfstime3(struct xdr_stream *xdr, struct timespec64 *timep) { - time->tv_sec = ntohl(*p++); - time->tv_nsec = ntohl(*p++); - return p; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p); + + return true; } -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) +/** + * svcxdr_decode_nfs_fh3 - Decode an NFSv3 file handle + * @xdr: XDR stream positioned at an undecoded NFSv3 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp 
has been initialized + */ +bool +svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) { - unsigned int size; + __be32 *p; + u32 size; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_FHSIZE) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; fh_init(fhp, NFS3_FHSIZE); - size = ntohl(*p++); - if (size > NFS3_FHSIZE) - return NULL; - - memcpy(&fhp->fh_handle.fh_base, p, size); fhp->fh_handle.fh_size = size; - return p + XDR_QUADLEN(size); -} + memcpy(&fhp->fh_handle.fh_base, p, size); -/* Helper function for NFSv3 ACL code */ -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) -{ - return decode_fh(p, fhp); + return true; } static __be32 * @@ -76,69 +92,165 @@ encode_fh(__be32 *p, struct svc_fh *fhp) return p + XDR_QUADLEN(size); } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. - */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) +static bool +svcxdr_decode_filename3(struct xdr_stream *xdr, char **name, unsigned int *len) { - char *name; - unsigned int i; + u32 size, i; + __be32 *p; + char *c; - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; + + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) { + if (*c == '\0' || *c == '/') + return false; } - return p; + return true; } -static __be32 * -decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns) +static bool +svcxdr_decode_diropargs3(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) { - u32 tmp; + return svcxdr_decode_nfs_fh3(xdr, fhp) && + 
svcxdr_decode_filename3(xdr, name, len); +} + +static bool +svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) +{ + u32 set_it; iap->ia_valid = 0; - if (*p++) { + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 mode; + + if (xdr_stream_decode_u32(xdr, &mode) < 0) + return false; iap->ia_valid |= ATTR_MODE; - iap->ia_mode = ntohl(*p++); + iap->ia_mode = mode; } - if (*p++) { - iap->ia_uid = make_kuid(userns, ntohl(*p++)); + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 uid; + + if (xdr_stream_decode_u32(xdr, &uid) < 0) + return false; + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), uid); if (uid_valid(iap->ia_uid)) iap->ia_valid |= ATTR_UID; } - if (*p++) { - iap->ia_gid = make_kgid(userns, ntohl(*p++)); + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 gid; + + if (xdr_stream_decode_u32(xdr, &gid) < 0) + return false; + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), gid); if (gid_valid(iap->ia_gid)) iap->ia_valid |= ATTR_GID; } - if (*p++) { - u64 newsize; + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u64 newsize; + if (xdr_stream_decode_u64(xdr, &newsize) < 0) + return false; iap->ia_valid |= ATTR_SIZE; - p = xdr_decode_hyper(p, &newsize); iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); } - if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: iap->ia_valid |= ATTR_ATIME; - } else if (tmp == 2) { /* set to client time */ + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_atime)) + return false; iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = ntohl(*p++); - iap->ia_atime.tv_nsec = ntohl(*p++); + break; + default: + return false; } - if ((tmp = ntohl(*p++)) == 1) { /* set to 
server time */ + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: iap->ia_valid |= ATTR_MTIME; - } else if (tmp == 2) { /* set to client time */ + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_mtime)) + return false; iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = ntohl(*p++); - iap->ia_mtime.tv_nsec = ntohl(*p++); + break; + default: + return false; } - return p; + + return true; +} + +static bool +svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) +{ + __be32 *p; + u32 check; + + if (xdr_stream_decode_bool(xdr, &check) < 0) + return false; + if (check) { + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + args->check_guard = 1; + args->guardtime = be32_to_cpup(p); + } else + args->check_guard = 0; + + return true; +} + +static bool +svcxdr_decode_specdata3(struct xdr_stream *xdr, struct nfsd3_mknodargs *args) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + args->major = be32_to_cpup(p++); + args->minor = be32_to_cpup(p); + + return true; +} + +static bool +svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct nfsd3_mknodargs *args) +{ + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) && + svcxdr_decode_specdata3(xdr, args); } static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) @@ -252,6 +364,11 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) return encode_post_op_attr(rqstp, p, fhp); } +static bool fs_supports_change_attribute(struct super_block *sb) +{ + return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion; +} + /* * Fill in the pre_op attr for the wcc data */ @@ -260,24 +377,26 @@ void fill_pre_wcc(struct svc_fh *fhp) struct inode *inode; struct kstat stat; bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - __be32 err; if (fhp->fh_no_wcc || 
fhp->fh_pre_saved) return; inode = d_inode(fhp->fh_dentry); - err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; + if (fs_supports_change_attribute(inode->i_sb) || !v4) { + __be32 err = fh_getattr(fhp, &stat); + + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; + } + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; } @@ -288,7 +407,6 @@ void fill_post_wcc(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); - __be32 err; if (fhp->fh_no_wcc) return; @@ -296,12 +414,16 @@ void fill_post_wcc(struct svc_fh *fhp) if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); - err = fh_getattr(fhp, &fhp->fh_post_attr); - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } else - fhp->fh_post_saved = true; + fhp->fh_post_saved = true; + + if (fs_supports_change_attribute(inode->i_sb) || !v4) { + __be32 err = fh_getattr(fhp, &fhp->fh_post_attr); + + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } + } if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); @@ -312,331 +434,277 @@ void fill_post_wcc(struct svc_fh *fhp) */ int -nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return 
svcxdr_decode_nfs_fh3(xdr, &args->fh); } int nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - if ((args->check_guard = ntohl(*p++)) != 0) { - struct timespec64 time; - p = decode_time3(p, &time); - args->guardtime = time.tv_sec; - } - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->fh) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) && + svcxdr_decode_sattrguard3(xdr, args); } int nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len); } int nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_accessargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) return 0; - args->access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - u32 max_blocksize = svc_max_payload(rqstp); - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - len = 
min(args->count, max_blocksize); - - /* set up the kvec */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); struct kvec *head = rqstp->rq_arg.head; struct kvec *tail = rqstp->rq_arg.tail; + size_t remaining; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->stable) < 0) return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - args->stable = ntohl(*p++); - len = args->len = ntohl(*p++); - if ((void *)p > head->iov_base + head->iov_len) + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) return 0; - /* - * The count must equal the amount of data passed. - */ + + /* request sanity */ if (args->count != args->len) return 0; - - /* - * Check to make sure that we got the right number of - * bytes. - */ - hdr = (void*)p - head->iov_base; - dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. 
- */ - if (dlen < XDR_QUADLEN(len)*4) + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->len)) return 0; - if (args->count > max_blocksize) { args->count = max_blocksize; - len = args->len = max_blocksize; + args->len = max_blocksize; } - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + return 1; } int nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) return 0; - - switch (args->createmode = ntohl(*p++)) { + if (xdr_stream_decode_u32(xdr, &args->createmode) < 0) + return 0; + switch (args->createmode) { case NFS3_CREATE_UNCHECKED: case NFS3_CREATE_GUARDED: - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - break; + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); case NFS3_CREATE_EXCLUSIVE: - args->verf = p; - p += 2; + args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE); + if (!args->verf) + return 0; break; default: return 0; } - - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) || - !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); } int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { 
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t dlen; + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; + size_t remaining; - if (!(p = decode_fh(p, &args->ffh)) || - !(p = decode_filename(p, &args->fname, &args->flen))) + if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen)) return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - args->tlen = ntohl(*p++); - - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; - - dlen = args->first.iov_len + rqstp->rq_arg.page_len + - rqstp->rq_arg.tail[0].iov_len; - if (dlen < XDR_QUADLEN(args->tlen) << 2) + if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs)) return 0; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) + return 0; + + /* request sanity */ + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->tlen)) + return 0; + + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + return 1; } int nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_mknodargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->ftype) < 0) + return 0; + switch (args->ftype) { + case NF3CHR: + case NF3BLK: + return svcxdr_decode_devicedata3(rqstp, xdr, args); + case NF3SOCK: + case NF3FIFO: + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); + case NF3REG: + case NF3DIR: + case NF3LNK: + /* Valid XDR but illegal file types */ + break; + default: return 0; - - args->ftype = ntohl(*p++); - - if (args->ftype == NF3BLK || 
args->ftype == NF3CHR - || args->ftype == NF3SOCK || args->ftype == NF3FIFO) - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - if (args->ftype == NF3BLK || args->ftype == NF3CHR) { - args->major = ntohl(*p++); - args->minor = ntohl(*p++); } - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); -} - -int -nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd3_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->ffh) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readdirargs *args = rqstp->rq_argp; - int len; - u32 max_blocksize = svc_max_payload(rqstp); - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, 
&args->cookie) < 0) + return 0; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ~0; - args->count = ntohl(*p++); - len = args->count = min_t(u32, args->count, max_blocksize); - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - if (!args->buffer) - args->buffer = page_address(p); - len -= PAGE_SIZE; - } - - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readdirargs *args = rqstp->rq_argp; - int len; - u32 max_blocksize = svc_max_payload(rqstp); + u32 dircount; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->cookie) < 0) + return 0; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return 0; + /* dircount is ignored */ + if (xdr_stream_decode_u32(xdr, &dircount) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ntohl(*p++); - args->count = ntohl(*p++); - len = args->count = min(args->count, max_blocksize); - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - if (!args->buffer) - args->buffer = page_address(p); - len -= PAGE_SIZE; - } - - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_commitargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if 
(xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + + return 1; } /* diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8d6d2678abad..acdb3cd806a1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -378,8 +378,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Before RECLAIM_COMPLETE done, server should deny new lock */ if (nfsd4_has_session(cstate) && - !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags) && + !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) return nfserr_grace; @@ -428,8 +427,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - status = nfs4_check_open_reclaim(&open->op_clientid, - cstate, nn); + status = nfs4_check_open_reclaim(cstate->clp); if (status) goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; @@ -1888,7 +1886,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = nfs_ok; if (gdp->gd_maxcount != 0) { nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, - rqstp, cstate->session->se_client, gdp); + rqstp, cstate->clp, gdp); } gdp->gd_notify_types &= ops->notify_types; @@ -2174,7 +2172,7 @@ nfsd4_proc_null(struct svc_rqst *rqstp) static inline void nfsd4_increment_op_stats(u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - nfsdstats.nfs4_opcount[opnum]++; + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; @@ -3305,6 +3303,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, + .pc_name = "NULL", }, [NFSPROC4_COMPOUND] = { .pc_func = nfsd4_proc_compound, @@ -3315,6 +3314,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_release = 
nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, .pc_xdrressize = NFSD_BUFSIZE/4, + .pc_name = "COMPOUND", }, }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1d2cd6a88f61..423fd6683f3a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3891,6 +3891,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; + struct nfs4_client *clp = cstate->clp; __be32 status = 0; if (rc->rca_one_fs) { @@ -3904,12 +3905,11 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, } status = nfserr_complete_already; - if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags)) + if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) goto out; status = nfserr_stale_clientid; - if (is_client_expired(cstate->session->se_client)) + if (is_client_expired(clp)) /* * The following error isn't really legal. * But we only get here if the client just explicitly @@ -3920,8 +3920,8 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, goto out; status = nfs_ok; - nfsd4_client_record_create(cstate->session->se_client); - inc_reclaim_complete(cstate->session->se_client); + nfsd4_client_record_create(clp); + inc_reclaim_complete(clp); out: return status; } @@ -4633,40 +4633,37 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 return nfserr_bad_seqid; } -static __be32 lookup_clientid(clientid_t *clid, - struct nfsd4_compound_state *cstate, - struct nfsd_net *nn, - bool sessions) +static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions, + struct nfsd_net *nn) { struct nfs4_client *found; + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, sessions, nn); + if (found) + atomic_inc(&found->cl_rpc_users); + spin_unlock(&nn->client_lock); + return found; +} + +static __be32 set_client(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct 
nfsd_net *nn) +{ if (cstate->clp) { - found = cstate->clp; - if (!same_clid(&found->cl_clientid, clid)) + if (!same_clid(&cstate->clp->cl_clientid, clid)) return nfserr_stale_clientid; return nfs_ok; } - if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; - /* - * For v4.1+ we get the client in the SEQUENCE op. If we don't have one - * cached already then we know this is for is for v4.0 and "sessions" - * will be false. + * We're in the 4.0 case (otherwise the SEQUENCE op would have + * set cstate->clp), so session = false: */ - WARN_ON_ONCE(cstate->session); - spin_lock(&nn->client_lock); - found = find_confirmed_client(clid, sessions, nn); - if (!found) { - spin_unlock(&nn->client_lock); + cstate->clp = lookup_clientid(clid, false, nn); + if (!cstate->clp) return nfserr_expired; - } - atomic_inc(&found->cl_rpc_users); - spin_unlock(&nn->client_lock); - - /* Cache the nfs4_client in cstate! */ - cstate->clp = found; return nfs_ok; } @@ -4680,8 +4677,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_openowner *oo = NULL; __be32 status; - if (STALE_CLIENTID(&open->op_clientid, nn)) - return nfserr_stale_clientid; /* * In case we need it later, after we've already created the * file and don't want to risk a further failure: @@ -4690,7 +4685,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - status = lookup_clientid(clientid, cstate, nn, false); + status = set_client(clientid, cstate, nn); if (status) return status; clp = cstate->clp; @@ -5300,17 +5295,14 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); trace_nfsd_clid_renew(clid); - status = lookup_clientid(clid, cstate, nn, false); + status = set_client(clid, cstate, nn); if (status) - goto out; + return status; clp = cstate->clp; - status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != 
NFSD4_CB_UP) - goto out; - status = nfs_ok; -out: - return status; + return nfserr_cb_path_down; + return nfs_ok; } void @@ -5686,8 +5678,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; - status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn, - false); + status = set_client(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { if (cstate->session) return nfserr_bad_stateid; @@ -5818,21 +5809,27 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, { __be32 status; struct nfs4_cpntf_state *cps = NULL; - struct nfsd4_compound_state cstate; + struct nfs4_client *found; status = manage_cpntf_state(nn, st, NULL, &cps); if (status) return status; cps->cpntf_time = ktime_get_boottime_seconds(); - memset(&cstate, 0, sizeof(cstate)); - status = lookup_clientid(&cps->cp_p_clid, &cstate, nn, true); - if (status) + + status = nfserr_expired; + found = lookup_clientid(&cps->cp_p_clid, true, nn); + if (!found) goto out; - status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - stid, nn); - put_client_renew(cstate.clp); + + *stid = find_stateid_by_type(found, &cps->cp_p_stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID); + if (*stid) + status = nfs_ok; + else + status = nfserr_bad_stateid; + + put_client_renew(found); out: nfs4_put_cpntf_state(nn, cps); return status; @@ -5921,7 +5918,7 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = @@ -5967,7 +5964,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state 
*cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; __be32 ret = nfserr_bad_stateid; spin_lock(&cl->cl_lock); @@ -6696,13 +6693,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->lk_new_clientid, - &cstate->session->se_client->cl_clientid, + &cstate->clp->cl_clientid, sizeof(clientid_t)); - status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) - goto out; - /* validate and update open stateid and open seqid */ status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, @@ -6909,8 +6902,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if (!nfsd4_has_session(cstate)) { - status = lookup_clientid(&lockt->lt_clientid, cstate, nn, - false); + status = set_client(&lockt->lt_clientid, cstate, nn); if (status) goto out; } @@ -7094,7 +7086,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - status = lookup_clientid(clid, cstate, nn, false); + status = set_client(clid, cstate, nn); if (status) return status; @@ -7230,25 +7222,13 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) return NULL; } -/* -* Called from OPEN. Look for clientid in reclaim list. 
-*/ __be32 -nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, - struct nfsd_net *nn) +nfs4_check_open_reclaim(struct nfs4_client *clp) { - __be32 status; - - /* find clientid in conf_id_hashtbl */ - status = lookup_clientid(clid, cstate, nn, false); - if (status) - return nfserr_reclaim_bad; - - if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) return nfserr_no_grace; - if (nfsd4_client_record_check(cstate->clp)) + if (nfsd4_client_record_check(clp)) return nfserr_reclaim_bad; return nfs_ok; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80c90fc231a5..96cdf77925f3 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -121,14 +121,14 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, struct nfsd_net *nn) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { - nn->drc_mem_usage -= rp->c_replvec.iov_len; + nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); kfree(rp->c_replvec.iov_base); } if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&nn->num_drc_entries); - nn->drc_mem_usage -= sizeof(*rp); + nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } kmem_cache_free(drc_slab, rp); } @@ -154,6 +154,16 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } +static int nfsd_reply_cache_stats_init(struct nfsd_net *nn) +{ + return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); +} + +static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn) +{ + nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); +} + int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -165,12 +175,16 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); + status = nfsd_reply_cache_stats_init(nn); + if (status) + goto out_nomem; + 
nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; status = register_shrinker(&nn->nfsd_reply_cache_shrinker); if (status) - goto out_nomem; + goto out_stats_destroy; nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); @@ -186,6 +200,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: unregister_shrinker(&nn->nfsd_reply_cache_shrinker); +out_stats_destroy: + nfsd_reply_cache_stats_destroy(nn); out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; @@ -196,6 +212,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct svc_cacherep *rp; unsigned int i; + nfsd_reply_cache_stats_destroy(nn); unregister_shrinker(&nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { @@ -324,7 +341,7 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, { if (key->c_key.k_xid == rp->c_key.k_xid && key->c_key.k_csum != rp->c_key.k_csum) { - ++nn->payload_misses; + nfsd_stats_payload_misses_inc(nn); trace_nfsd_drc_mismatch(nn, key, rp); } @@ -407,7 +424,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { - nfsdstats.rcnocache++; + nfsd_stats_rc_nocache_inc(); goto out; } @@ -429,12 +446,12 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) goto found_entry; } - nfsdstats.rcmisses++; + nfsd_stats_rc_misses_inc(); rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; atomic_inc(&nn->num_drc_entries); - nn->drc_mem_usage += sizeof(*rp); + nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); /* go ahead and prune the cache */ prune_bucket(b, nn); @@ -446,7 +463,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) found_entry: /* We found a matching entry which is either in progress or done. 
*/ - nfsdstats.rchits++; + nfsd_stats_rc_hits_inc(); rtn = RC_DROPIT; /* Request being processed */ @@ -548,7 +565,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) return; } spin_lock(&b->cache_lock); - nn->drc_mem_usage += bufsize; + nfsd_stats_drc_mem_usage_add(nn, bufsize); lru_put_end(b, rp); rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); rp->c_type = cachetype; @@ -588,13 +605,18 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", - atomic_read(&nn->num_drc_entries)); + atomic_read(&nn->num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); - seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage); - seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); - seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); - seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); - seq_printf(m, "payload misses: %u\n", nn->payload_misses); + seq_printf(m, "mem usage: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE])); + seq_printf(m, "cache hits: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + seq_printf(m, "cache misses: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + seq_printf(m, "not cached: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + seq_printf(m, "payload misses: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f6d5d783f4a4..4f6e514192bd 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -32,6 +32,7 @@ enum { NFSD_Root = 1, NFSD_List, + NFSD_Export_Stats, NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, @@ -1348,6 +1349,8 @@ static 
int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, + /* Per-export io stats use same ops as exports file */ + [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", @@ -1534,7 +1537,9 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - nfsd_stat_init(); /* Statistics */ + retval = nfsd_stat_init(); /* Statistics */ + if (retval) + goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) goto out_free_stat; @@ -1554,6 +1559,7 @@ static int __init init_nfsd(void) nfsd_drc_slab_free(); out_free_stat: nfsd_stat_shutdown(); +out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d63cf8196fed..8bdc37aa2c2e 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -24,8 +24,8 @@ #include #include "netns.h" -#include "stats.h" #include "export.h" +#include "stats.h" #undef ifdebug #ifdef CONFIG_SUNRPC_DEBUG diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 66f2ef67792a..4744a276058d 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -349,7 +349,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { - struct svc_export *exp; + struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -422,7 +422,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) } out: if (error == nfserr_stale) - nfsdstats.fh_stale++; + nfsd_stats_fh_stale_inc(exp); return error; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index cb20c2cd3469..f58933519f38 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -12,6 +12,7 @@ #include #include #include +#include 
static inline __u32 ino_t_to_u32(ino_t ino) { @@ -264,7 +265,9 @@ fh_clear_wcc(struct svc_fh *fhp) static inline u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) { - if (IS_I_VERSION(inode)) { + if (inode->i_sb->s_export_op->fetch_iversion) + return inode->i_sb->s_export_op->fetch_iversion(inode); + else if (IS_I_VERSION(inode)) { u64 chattr; chattr = stat->ctime.tv_sec; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 9473d048efec..b2f8035f166b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -149,14 +149,15 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) static __be32 nfsd_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_readlinkres *resp = rqstp->rq_resp; + char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len); fh_put(&argp->fh); return rpc_success; @@ -171,32 +172,36 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; + unsigned int len; u32 eof; + int v; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->offset); + argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + + v = 0; + len = argp->count; + while (len > 0) { + struct page *page = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(page); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. 
*/ - - if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { - char buf[RPC_MAX_ADDRBUFLEN]; - printk(KERN_NOTICE - "oversized read request from %s (%d bytes)\n", - svc_print_addr(rqstp, buf, sizeof(buf)), - argp->count); - argp->count = NFSSVC_MAXBLKSIZE_V2; - } svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; - resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, - rqstp->rq_vec, argp->vlen, - &resp->count, - &eof); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, v, &resp->count, &eof); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) @@ -548,6 +553,20 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp) return rpc_success; } +static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd_readdirres *resp, + int count) +{ + count = min_t(u32, count, PAGE_SIZE); + + /* Convert byte count to number of words (i.e. >> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->buflen = (count >> 2) - 2; + + resp->buffer = page_address(*rqstp->rq_next_page); + rqstp->rq_next_page++; +} + /* * Read a portion of a directory. 
*/ @@ -556,31 +575,24 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) { struct nfsd_readdirargs *argp = rqstp->rq_argp; struct nfsd_readdirres *resp = rqstp->rq_resp; - int count; loff_t offset; + __be32 *buffer; dprintk("nfsd: READDIR %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->cookie); - /* Shrink to the client read size */ - count = (argp->count >> 2) - 2; + nfsd_init_dirlist_pages(rqstp, resp, argp->count); + buffer = resp->buffer; - /* Make sure we've room for the NULL ptr & eof flag */ - count -= 2; - if (count < 0) - count = 0; - - resp->buffer = argp->buffer; resp->offset = NULL; - resp->buflen = count; resp->common.err = nfs_ok; /* Read directory and encode entries on the fly */ offset = argp->cookie; resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, &resp->common, nfssvc_encode_entry); - resp->count = resp->buffer - argp->buffer; + resp->count = resp->buffer - buffer; if (resp->offset) *resp->offset = htonl(offset); @@ -623,16 +635,18 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "NULL", }, [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, - .pc_decode = nfssvc_decode_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_attrstat, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFSPROC_SETATTR] = { .pc_func = nfsd_proc_setattr, @@ -643,6 +657,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "SETATTR", }, [NFSPROC_ROOT] = { .pc_func = nfsd_proc_root, @@ -652,6 +667,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 
0, + .pc_name = "ROOT", }, [NFSPROC_LOOKUP] = { .pc_func = nfsd_proc_lookup, @@ -662,15 +678,17 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+AT, + .pc_name = "LOOKUP", }, [NFSPROC_READLINK] = { .pc_func = nfsd_proc_readlink, - .pc_decode = nfssvc_decode_readlinkargs, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_readlinkres, - .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFSPROC_READ] = { .pc_func = nfsd_proc_read, @@ -681,6 +699,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + .pc_name = "READ", }, [NFSPROC_WRITECACHE] = { .pc_func = nfsd_proc_writecache, @@ -690,6 +709,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "WRITECACHE", }, [NFSPROC_WRITE] = { .pc_func = nfsd_proc_write, @@ -700,6 +720,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "WRITE", }, [NFSPROC_CREATE] = { .pc_func = nfsd_proc_create, @@ -710,6 +731,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "CREATE", }, [NFSPROC_REMOVE] = { .pc_func = nfsd_proc_remove, @@ -719,6 +741,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "REMOVE", }, [NFSPROC_RENAME] = { 
.pc_func = nfsd_proc_rename, @@ -728,6 +751,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RENAME", }, [NFSPROC_LINK] = { .pc_func = nfsd_proc_link, @@ -737,6 +761,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "LINK", }, [NFSPROC_SYMLINK] = { .pc_func = nfsd_proc_symlink, @@ -746,6 +771,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "SYMLINK", }, [NFSPROC_MKDIR] = { .pc_func = nfsd_proc_mkdir, @@ -756,6 +782,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "MKDIR", }, [NFSPROC_RMDIR] = { .pc_func = nfsd_proc_rmdir, @@ -765,6 +792,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RMDIR", }, [NFSPROC_READDIR] = { .pc_func = nfsd_proc_readdir, @@ -773,15 +801,17 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_argsize = sizeof(struct nfsd_readdirargs), .pc_ressize = sizeof(struct nfsd_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFSPROC_STATFS] = { .pc_func = nfsd_proc_statfs, - .pc_decode = nfssvc_decode_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_statfsres, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_statfsres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+5, + .pc_name = "STATFS", }, }; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index f9c9f4c63cc7..6de406322106 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -955,37 +955,6 @@ nfsd(void *vrqstp) 
return 0; } -/* - * A write procedure can have a large argument, and a read procedure can - * have a large reply, but no NFSv2 or NFSv3 procedure has argument and - * reply that can both be larger than a page. The xdr code has taken - * advantage of this assumption to be a sloppy about bounds checking in - * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that - * problem, we enforce these assumptions here: - */ -static bool nfs_request_too_big(struct svc_rqst *rqstp, - const struct svc_procedure *proc) -{ - /* - * The ACL code has more careful bounds-checking and is not - * susceptible to this problem: - */ - if (rqstp->rq_prog != NFS_PROGRAM) - return false; - /* - * Ditto NFSv4 (which can in theory have argument and reply both - * more than a page): - */ - if (rqstp->rq_vers >= 4) - return false; - /* The reply will be small, we're OK: */ - if (proc->pc_xdrressize > 0 && - proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) - return false; - - return rqstp->rq_arg.len > PAGE_SIZE; -} - /** * nfsd_dispatch - Process an NFS or NFSACL Request * @rqstp: incoming request @@ -1004,9 +973,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; - if (nfs_request_too_big(rqstp, proc)) - goto out_decode_err; - /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 7aa6e8aca2c1..5d79ef6a0c7f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -23,24 +23,31 @@ static u32 nfs_ftypes[] = { /* - * XDR functions for basic NFS types + * Basic NFSv2 data types (RFC 1094 Section 2.3) */ -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) + +/** + * svcxdr_decode_fhandle - Decode an NFSv2 file handle + * @xdr: XDR stream positioned at an encoded NFSv2 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp has been 
initialized + */ +bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) { + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_FHSIZE); + if (!p) + return false; fh_init(fhp, NFS_FHSIZE); memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); fhp->fh_handle.fh_size = NFS_FHSIZE; - /* FIXME: Look up export pointer here and verify - * Sun Secure RPC if requested */ - return p + (NFS_FHSIZE >> 2); -} - -/* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) -{ - return decode_fh(p, fhp); + return true; } static __be32 * @@ -50,66 +57,95 @@ encode_fh(__be32 *p, struct svc_fh *fhp) return p + (NFS_FHSIZE>> 2); } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. - */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) +static bool +svcxdr_decode_filename(struct xdr_stream *xdr, char **name, unsigned int *len) { - char *name; - unsigned int i; + u32 size, i; + __be32 *p; + char *c; - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } - } + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; - return p; + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) + if (*c == '\0' || *c == '/') + return false; + + return true; } -static __be32 * -decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) +static bool +svcxdr_decode_diropargs(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) { - u32 tmp, tmp1; + return svcxdr_decode_fhandle(xdr, fhp) && + svcxdr_decode_filename(xdr, name, len); +} + +static bool +svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) +{ + u32 tmp1, tmp2; + __be32 *p; + + 
p = xdr_inline_decode(xdr, XDR_UNIT * 8); + if (!p) + return false; iap->ia_valid = 0; - /* Sun client bug compatibility check: some sun clients seem to - * put 0xffff in the mode field when they mean 0xffffffff. - * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. + /* + * Some Sun clients put 0xffff in the mode field when they + * mean 0xffffffff. */ - if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp1 != 0xffff) { iap->ia_valid |= ATTR_MODE; - iap->ia_mode = tmp; + iap->ia_mode = tmp1; } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_uid = make_kuid(userns, tmp); + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), tmp1); if (uid_valid(iap->ia_uid)) iap->ia_valid |= ATTR_UID; } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_gid = make_kgid(userns, tmp); + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), tmp1); if (gid_valid(iap->ia_gid)) iap->ia_valid |= ATTR_GID; } - if ((tmp = ntohl(*p++)) != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { iap->ia_valid |= ATTR_SIZE; - iap->ia_size = tmp; + iap->ia_size = tmp1; } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = tmp; - iap->ia_atime.tv_nsec = tmp1 * 1000; + iap->ia_atime.tv_sec = tmp1; + iap->ia_atime.tv_nsec = tmp2 * NSEC_PER_USEC; } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = tmp; - iap->ia_mtime.tv_nsec = tmp1 * 1000; + iap->ia_mtime.tv_sec = tmp1; + iap->ia_mtime.tv_nsec = tmp2 * 
NSEC_PER_USEC; /* * Passing the invalid value useconds=1000000 for mtime * is a Sun convention for "set both mtime and atime to @@ -119,10 +155,11 @@ decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) * sattr in section 6.1 of "NFS Illustrated" by * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 */ - if (tmp1 == 1000000) + if (tmp2 == 1000000) iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); } - return p; + + return true; } static __be32 * @@ -194,225 +231,158 @@ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *f */ int -nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh); } int nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len); } int nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - p = decode_fh(p, &args->fh); - if (!p) + u32 totalcount; + + 
if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) return 0; - args->offset = ntohl(*p++); - len = args->count = ntohl(*p++); - p++; /* totalcount - unused */ - - len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); - - /* set up somewhere to store response. - * We take pages, put them on reslist and include in iovec - */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; + u32 beginoffset, totalcount; + size_t remaining; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + /* beginoffset is ignored */ + if (xdr_stream_decode_u32(xdr, &beginoffset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return 0; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) return 0; - p++; /* beginoffset */ - args->offset = ntohl(*p++); /* offset */ - p++; /* totalcount */ - len = args->len = ntohl(*p++); - /* - * The protocol specifies a maximum of 8192 bytes. - */ - if (len > NFSSVC_MAXBLKSIZE_V2) + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) return 0; - - /* - * Check to make sure that we got the right number of - * bytes. 
- */ - hdr = (void*)p - head->iov_base; - if (hdr > head->iov_len) + if (args->len > NFSSVC_MAXBLKSIZE_V2) return 0; - dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; - - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. - */ - if (dlen < XDR_QUADLEN(len)*4) + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->len)) return 0; + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; return 1; } int nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_createargs *args = rqstp->rq_argp; - if ( !(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); -} - -int -nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = 
page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->ffh) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t xdrlen; + struct kvec *head = rqstp->rq_arg.head; - if ( !(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen))) + if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, &args->flen)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) return 0; - - args->tlen = ntohl(*p++); if (args->tlen == 0) return 0; - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; - - /* This request is never larger than a page. Therefore, - * transport will deliver either: - * 1. pathname in the pagelist -> sattr is in the tail. - * 2. everything in the head buffer -> sattr is in the head. 
- */ - if (rqstp->rq_arg.page_len) { - if (args->tlen != rqstp->rq_arg.page_len) - return 0; - p = rqstp->rq_arg.tail[0].iov_base; - } else { - xdrlen = XDR_QUADLEN(args->tlen); - if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) - return 0; - p += xdrlen; - } - decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return 1; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + args->first.iov_base = xdr_inline_decode(xdr, args->tlen); + if (!args->first.iov_base) + return 0; + return svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_readdirargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->cookie) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - args->cookie = ntohl(*p++); - args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); - args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 9eae11a9d21c..73deea353169 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -649,8 +649,7 @@ void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *) extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(struct nfs4_client *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); diff --git a/fs/nfsd/stats.c 
b/fs/nfsd/stats.c index b1bc582b0493..1d3b881e7382 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -7,16 +7,14 @@ * Format: * rc * Statistsics for the reply cache - * fh + * fh * statistics for filehandle lookup * io * statistics for IO throughput - * th <10%-20%> <20%-30%> ... <90%-100%> <100%> - * time (seconds) when nfsd thread usage above thresholds - * and number of times that all threads were in use - * ra cache-size <10% <20% <30% ... <100% not-found - * number of times that read-ahead entry was found that deep in - * the cache. + * th + * number of threads + * ra + * * plus generic RPC stats (see net/sunrpc/stats.c) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch @@ -38,31 +36,24 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) { int i; - seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", - nfsdstats.rchits, - nfsdstats.rcmisses, - nfsdstats.rcnocache, - nfsdstats.fh_stale, - nfsdstats.fh_lookup, - nfsdstats.fh_anon, - nfsdstats.fh_nocache_dir, - nfsdstats.fh_nocache_nondir, - nfsdstats.io_read, - nfsdstats.io_write); - /* thread usage: */ - seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); - for (i=0; i<10; i++) { - unsigned int jifs = nfsdstats.th_usage[i]; - unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; - seq_printf(seq, " %u.%03u", sec, msec); - } + seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); + + /* thread usage: */ + seq_printf(seq, "th %u 0", nfsdstats.th_cnt); + + /* deprecated thread usage histogram stats */ + for (i = 0; i < 10; i++) + 
seq_puts(seq, " 0.000"); + + /* deprecated ra-cache stats */ + seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); - /* newline and ra-cache */ - seq_printf(seq, "\nra %u", nfsdstats.ra_size); - for (i=0; i<11; i++) - seq_printf(seq, " %u", nfsdstats.ra_depth[i]); - seq_putc(seq, '\n'); - /* show my rpc info */ svc_seq_show(seq, &nfsd_svcstats); @@ -70,8 +61,10 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); - for (i = 0; i <= LAST_NFS4_OP; i++) - seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + for (i = 0; i <= LAST_NFS4_OP; i++) { + seq_printf(seq, " %lld", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + } seq_putc(seq, '\n'); #endif @@ -91,14 +84,63 @@ static const struct proc_ops nfsd_proc_ops = { .proc_release = single_release, }; -void -nfsd_stat_init(void) +int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) { - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); + int i, err = 0; + + for (i = 0; !err && i < num; i++) + err = percpu_counter_init(&counters[i], 0, GFP_KERNEL); + + if (!err) + return 0; + + for (; i > 0; i--) + percpu_counter_destroy(&counters[i-1]); + + return err; } -void -nfsd_stat_shutdown(void) +void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num) { + int i; + + for (i = 0; i < num; i++) + percpu_counter_set(&counters[i], 0); +} + +void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) +{ + int i; + + for (i = 0; i < num; i++) + percpu_counter_destroy(&counters[i]); +} + +static int nfsd_stat_counters_init(void) +{ + return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); +} + +static void nfsd_stat_counters_destroy(void) +{ + nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); +} + +int nfsd_stat_init(void) +{ + int 
err; + + err = nfsd_stat_counters_init(); + if (err) + return err; + + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); + + return 0; +} + +void nfsd_stat_shutdown(void) +{ + nfsd_stat_counters_destroy(); svc_proc_unregister(&init_net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index b23fdac69820..51ecda852e23 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -8,37 +8,91 @@ #define _NFSD_STATS_H #include +#include +enum { + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ +#ifdef CONFIG_NFSD_V4 + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) +#endif + NFSD_STATS_COUNTERS_NUM +}; + struct nfsd_stats { - unsigned int rchits; /* repcache hits */ - unsigned int rcmisses; /* repcache hits */ - unsigned int rcnocache; /* uncached reqs */ - unsigned int fh_stale; /* FH stale error */ - unsigned int fh_lookup; /* dentry cached */ - unsigned int fh_anon; /* anon file dentry returned */ - unsigned int fh_nocache_dir; /* filehandle not found in dcache */ - unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ - unsigned int io_read; /* bytes returned to read requests */ - unsigned int io_write; /* bytes passed in write requests */ - unsigned int th_cnt; /* number of available threads */ - unsigned int th_usage[10]; /* number of ticks during which n perdeciles - * of available threads were in use */ - unsigned int th_fullcnt; /* number of times last free thread was used */ - unsigned int ra_size; /* size of ra cache */ - unsigned int ra_depth[11]; /* number of times ra entry was found that deep - * in the cache (10percentiles). 
[10] = not found */ -#ifdef CONFIG_NFSD_V4 - unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ -#endif + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + /* Protected by nfsd_mutex */ + unsigned int th_cnt; /* number of available threads */ }; extern struct nfsd_stats nfsdstats; + extern struct svc_stat nfsd_svcstats; -void nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +int nfsd_percpu_counters_init(struct percpu_counter counters[], int num); +void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num); +void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num); +int nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +static inline void nfsd_stats_rc_hits_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); +} + +static inline void nfsd_stats_rc_misses_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); +} + +static inline void nfsd_stats_rc_nocache_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); +} + +static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + if (exp) + percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]); +} + +static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) +{ + percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + if (exp) + percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount); +} + +static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) +{ + percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); + if (exp) + percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount); +} + +static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]); +} + +static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, 
s64 amount) +{ + percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); +} + +static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) +{ + percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); +} #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 04937e51de56..d316e11923c5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -889,7 +889,7 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsdstats.io_read += host_err; + nfsd_stats_io_read_add(fhp->fh_export, host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1040,7 +1040,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsdstats.io_write += *cnt; + nfsd_stats_io_write_add(exp, *cnt); fsnotify_modify(file); if (stable && use_wgather) { diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index ad77387734cc..3018b52b6d5e 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -27,7 +27,6 @@ struct nfsd_readargs { struct svc_fh fh; __u32 offset; __u32 count; - int vlen; }; struct nfsd_writeargs { @@ -53,11 +52,6 @@ struct nfsd_renameargs { unsigned int tlen; }; -struct nfsd_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -79,7 +73,6 @@ struct nfsd_readdirargs { struct svc_fh fh; __u32 cookie; __u32 count; - __be32 * buffer; }; struct nfsd_stat { @@ -144,14 +137,13 @@ union nfsd_xdrstore { #define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) -int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *); +int nfssvc_decode_fhandleargs(struct svc_rqst *, __be32 *); int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readargs(struct svc_rqst *, __be32 *); int 
nfssvc_decode_writeargs(struct svc_rqst *, __be32 *); int nfssvc_decode_createargs(struct svc_rqst *, __be32 *); int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); @@ -172,6 +164,6 @@ void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); +bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp); #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 456fcd7a1038..3e1578953f54 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -25,14 +25,13 @@ struct nfsd3_diropargs { struct nfsd3_accessargs { struct svc_fh fh; - unsigned int access; + __u32 access; }; struct nfsd3_readargs { struct svc_fh fh; __u64 offset; __u32 count; - int vlen; }; struct nfsd3_writeargs { @@ -71,11 +70,6 @@ struct nfsd3_renameargs { unsigned int tlen; }; -struct nfsd3_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd3_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -96,10 +90,8 @@ struct nfsd3_symlinkargs { struct nfsd3_readdirargs { struct svc_fh fh; __u64 cookie; - __u32 dircount; __u32 count; __be32 * verf; - __be32 * buffer; }; struct nfsd3_commitargs { @@ -110,13 +102,13 @@ struct nfsd3_commitargs { struct nfsd3_getaclargs { struct svc_fh fh; - int mask; + __u32 mask; }; struct posix_acl; struct nfsd3_setaclargs { struct svc_fh fh; - int mask; + __u32 mask; struct posix_acl *acl_access; struct posix_acl *acl_default; }; @@ -273,7 +265,7 @@ union nfsd3_xdrstore { #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) -int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); +int 
nfs3svc_decode_fhandleargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *); @@ -283,7 +275,6 @@ int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); @@ -316,7 +307,6 @@ int nfs3svc_encode_entry_plus(void *, const char *name, /* Helper functions for NFSv3 ACL code */ __be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); - +bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp); #endif /* _LINUX_NFSD_XDR3_H */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 9f4d4bcbf251..fe848901fcc3 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,6 +213,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + u64 (*fetch_iversion)(struct inode *); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 7dc2a06cf19a..c6cc0a566ef5 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -274,6 +274,9 @@ struct f2fs_inode { __u8 i_compress_algorithm; /* compress algorithm */ __u8 i_log_cluster_size; /* log of 
cluster size */ __le16 i_compress_flag; /* compress flag */ + /* 0 bit: chksum flag + * [10,15] bits: compress level + */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 647c35423545..933eaf218738 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3192,11 +3192,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -#ifdef CONFIG_UNICODE -extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); -extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name); -#endif extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h index 103d44695323..0ba99c513649 100644 --- a/include/linux/nfsacl.h +++ b/include/linux/nfsacl.h @@ -38,5 +38,8 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, extern int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, struct posix_acl **pacl); +extern bool +nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt, + struct posix_acl **pacl); #endif /* __LINUX_NFSACL_H */ diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 43f854487539..938c2bf29db8 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -10,9 +10,6 @@ #define RPC_VERSION 2 -/* size of an XDR encoding unit in bytes, i.e. 
32bit */ -#define XDR_UNIT (4) - /* spec defines authentication flavor as an unsigned 32 bit integer */ typedef u32 rpc_authflavor_t; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 34c2a69820e9..31ee3b6047c3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -463,6 +463,7 @@ struct svc_procedure { unsigned int pc_ressize; /* result struct size */ unsigned int pc_cachetype; /* cache info (NFS) */ unsigned int pc_xdrressize; /* maximum size of XDR reply */ + const char * pc_name; /* for display */ }; /* diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 294b56e61522..7c693b31965e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -49,6 +49,7 @@ #include #include +#include #include #include @@ -65,15 +66,10 @@ extern unsigned int svcrdma_max_requests; extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; -extern atomic_t rdma_stat_recv; -extern atomic_t rdma_stat_read; -extern atomic_t rdma_stat_write; -extern atomic_t rdma_stat_sq_starve; -extern atomic_t rdma_stat_rq_starve; -extern atomic_t rdma_stat_rq_poll; -extern atomic_t rdma_stat_rq_prod; -extern atomic_t rdma_stat_sq_poll; -extern atomic_t rdma_stat_sq_prod; +extern struct percpu_counter svcrdma_stat_read; +extern struct percpu_counter svcrdma_stat_recv; +extern struct percpu_counter svcrdma_stat_sq_starve; +extern struct percpu_counter svcrdma_stat_write; struct svcxprt_rdma { struct svc_xprt sc_xprt; /* SVC transport structure */ @@ -108,6 +104,7 @@ struct svcxprt_rdma { wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ unsigned long sc_flags; + u32 sc_pending_recvs; struct list_head sc_read_complete_q; struct work_struct sc_work; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index b26213ae8c1a..2bc75c167f00 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -19,6 +19,13 @@ struct bio_vec; struct 
rpc_rqst; +/* + * Size of an XDR encoding unit in bytes, i.e. 32 bits, + * as defined in Section 3 of RFC 4506. All encoded + * XDR data items are aligned on a boundary of 32 bits. + */ +#define XDR_UNIT sizeof(__be32) + /* * Buffer adjustment */ @@ -329,7 +336,7 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, static inline size_t xdr_align_size(size_t n) { - const size_t mask = sizeof(__u32) - 1; + const size_t mask = XDR_UNIT - 1; return (n + mask) & ~mask; } @@ -359,7 +366,7 @@ static inline size_t xdr_pad_size(size_t n) */ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) { - const size_t len = sizeof(__be32); + const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) @@ -378,7 +385,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) */ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) { - const size_t len = sizeof(__be32); + const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 6f89c27265f5..036eb1f5c133 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1603,6 +1603,7 @@ TRACE_EVENT(svc_process, __field(u32, vers) __field(u32, proc) __string(service, name) + __string(procedure, rqst->rq_procinfo->pc_name) __string(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)") ), @@ -1612,13 +1613,16 @@ TRACE_EVENT(svc_process, __entry->vers = rqst->rq_vers; __entry->proc = rqst->rq_proc; __assign_str(service, name); + __assign_str(procedure, rqst->rq_procinfo->pc_name); __assign_str(addr, rqst->rq_xprt ? 
rqst->rq_xprt->xpt_remotebuf : "(null)"); ), - TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%u", + TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%s", __get_str(addr), __entry->xid, - __get_str(service), __entry->vers, __entry->proc) + __get_str(service), __entry->vers, + __get_str(procedure) + ) ); DECLARE_EVENT_CLASS(svc_rqst_event, @@ -1874,6 +1878,7 @@ TRACE_EVENT(svc_stats_latency, TP_STRUCT__entry( __field(u32, xid) __field(unsigned long, execute) + __string(procedure, rqst->rq_procinfo->pc_name) __string(addr, rqst->rq_xprt->xpt_remotebuf) ), @@ -1881,11 +1886,13 @@ TRACE_EVENT(svc_stats_latency, __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->execute = ktime_to_us(ktime_sub(ktime_get(), rqst->rq_stime)); + __assign_str(procedure, rqst->rq_procinfo->pc_name); __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), - TP_printk("addr=%s xid=0x%08x execute-us=%lu", - __get_str(addr), __entry->xid, __entry->execute) + TP_printk("addr=%s xid=0x%08x proc=%s execute-us=%lu", + __get_str(addr), __entry->xid, __get_str(procedure), + __entry->execute) ); DECLARE_EVENT_CLASS(svc_deferred_event, diff --git a/include/uapi/linux/nfs3.h b/include/uapi/linux/nfs3.h index 37e4b34e6b43..c22ab77713bd 100644 --- a/include/uapi/linux/nfs3.h +++ b/include/uapi/linux/nfs3.h @@ -63,6 +63,12 @@ enum nfs3_ftype { NF3BAD = 8 }; +enum nfs3_time_how { + DONT_CHANGE = 0, + SET_TO_SERVER_TIME = 1, + SET_TO_CLIENT_TIME = 2, +}; + struct nfs3_fh { unsigned short size; unsigned char data[NFS3_FHSIZE]; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 4187745887f0..61fb8a18552c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -559,7 +559,7 @@ EXPORT_SYMBOL_GPL(svc_destroy); /* * Allocate an RPC server's buffer space. - * We allocate pages and place them in rq_argpages. + * We allocate pages and place them in rq_pages. 
*/ static int svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 526da5d4710b..5bc20e9d09cd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -62,51 +62,47 @@ static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; +static unsigned int svcrdma_stat_unused; +static unsigned int zero; -atomic_t rdma_stat_recv; -atomic_t rdma_stat_read; -atomic_t rdma_stat_write; -atomic_t rdma_stat_sq_starve; -atomic_t rdma_stat_rq_starve; -atomic_t rdma_stat_rq_poll; -atomic_t rdma_stat_rq_prod; -atomic_t rdma_stat_sq_poll; -atomic_t rdma_stat_sq_prod; +struct percpu_counter svcrdma_stat_read; +struct percpu_counter svcrdma_stat_recv; +struct percpu_counter svcrdma_stat_sq_starve; +struct percpu_counter svcrdma_stat_write; -/* - * This function implements reading and resetting an atomic_t stat - * variable through read/write to a proc file. Any write to the file - * resets the associated statistic to zero. Any read returns it's - * current value. 
- */ -static int read_reset_stat(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +enum { + SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), +}; + +static int svcrdma_counter_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { - atomic_t *stat = (atomic_t *)table->data; + struct percpu_counter *stat = (struct percpu_counter *)table->data; + char tmp[SVCRDMA_COUNTER_BUFSIZ + 1]; + int len; - if (!stat) - return -EINVAL; - - if (write) - atomic_set(stat, 0); - else { - char str_buf[32]; - int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); - if (len >= 32) - return -EFAULT; - len = strlen(str_buf); - if (*ppos > len) { - *lenp = 0; - return 0; - } - len -= *ppos; - if (len > *lenp) - len = *lenp; - if (len) - memcpy(buffer, str_buf, len); - *lenp = len; - *ppos += len; + if (write) { + percpu_counter_set(stat, 0); + return 0; } + + len = snprintf(tmp, SVCRDMA_COUNTER_BUFSIZ, "%lld\n", + percpu_counter_sum_positive(stat)); + if (len >= SVCRDMA_COUNTER_BUFSIZ) + return -EFAULT; + len = strlen(tmp); + if (*ppos > len) { + *lenp = 0; + return 0; + } + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + *lenp = len; + *ppos += len; + return 0; } @@ -142,66 +138,76 @@ static struct ctl_table svcrdma_parm_table[] = { { .procname = "rdma_stat_read", - .data = &rdma_stat_read, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_read, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_recv", - .data = &rdma_stat_recv, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_recv, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_write", - .data = &rdma_stat_write, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_write, + .maxlen = 
SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_sq_starve", - .data = &rdma_stat_sq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_sq_starve, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_rq_starve", - .data = &rdma_stat_rq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_poll", - .data = &rdma_stat_rq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_prod", - .data = &rdma_stat_rq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_poll", - .data = &rdma_stat_sq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_prod", - .data = &rdma_stat_sq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { }, }; @@ -224,27 +230,69 @@ static struct ctl_table svcrdma_root_table[] = { { }, }; +static void svc_rdma_proc_cleanup(void) +{ + if 
(!svcrdma_table_header) + return; + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + + percpu_counter_destroy(&svcrdma_stat_write); + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); +} + +static int svc_rdma_proc_init(void) +{ + int rc; + + if (svcrdma_table_header) + return 0; + + rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); + if (rc) + goto out_err; + + svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + return 0; + +out_err: + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); + return rc; +} + void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - if (svcrdma_table_header) { - unregister_sysctl_table(svcrdma_table_header); - svcrdma_table_header = NULL; - } svc_unreg_xprt_class(&svc_rdma_class); + svc_rdma_proc_cleanup(); } int svc_rdma_init(void) { + int rc; + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - if (!svcrdma_table_header) - svcrdma_table_header = - register_sysctl_table(svcrdma_root_table); + rc = svc_rdma_proc_init(); + if (rc) + return rc; /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index cbdb71247755..6d28f23ceb35 100644 --- 
a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -266,33 +266,46 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp) svc_rdma_recv_ctxt_put(rdma, ctxt); } -static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt) +static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, + unsigned int wanted, bool temp) { + const struct ib_recv_wr *bad_wr = NULL; + struct svc_rdma_recv_ctxt *ctxt; + struct ib_recv_wr *recv_chain; int ret; - trace_svcrdma_post_recv(ctxt); - ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); + recv_chain = NULL; + while (wanted--) { + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + break; + + trace_svcrdma_post_recv(ctxt); + ctxt->rc_temp = temp; + ctxt->rc_recv_wr.next = recv_chain; + recv_chain = &ctxt->rc_recv_wr; + rdma->sc_pending_recvs++; + } + if (!recv_chain) + return false; + + ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); if (ret) goto err_post; - return 0; + return true; err_post: + while (bad_wr) { + ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt, + rc_recv_wr); + bad_wr = bad_wr->next; + svc_rdma_recv_ctxt_put(rdma, ctxt); + } + trace_svcrdma_rq_post_err(rdma, ret); - svc_rdma_recv_ctxt_put(rdma, ctxt); - return ret; -} - -static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_recv_ctxt *ctxt; - - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return 0; - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - return __svc_rdma_post_recv(rdma, ctxt); + /* Since we're destroying the xprt, no need to reset + * sc_pending_recvs. 
*/ + return false; } /** @@ -303,20 +316,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - struct svc_rdma_recv_ctxt *ctxt; - unsigned int i; - int ret; - - for (i = 0; i < rdma->sc_max_requests; i++) { - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return false; - ctxt->rc_temp = true; - ret = __svc_rdma_post_recv(rdma, ctxt); - if (ret) - return false; - } - return true; + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); } /** @@ -324,8 +324,6 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) * @cq: Completion Queue context * @wc: Work Completion object * - * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that - * the Receive completion handler could be running. */ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { @@ -333,6 +331,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; + rdma->sc_pending_recvs--; + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); @@ -340,14 +340,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) goto flushed; - if (svc_rdma_post_recv(rdma)) - goto post_err; - /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; - ib_dma_sync_single_for_cpu(rdma->sc_pd->device, - ctxt->rc_recv_sge.addr, - wc->byte_len, DMA_FROM_DEVICE); spin_lock(&rdma->sc_rq_dto_lock); list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); @@ -356,11 +350,18 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); + + if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) && + rdma->sc_pending_recvs < rdma->sc_max_requests) + if (!svc_rdma_refresh_recvs(rdma, 
RPCRDMA_MAX_RECV_BATCH, + false)) + goto post_err; + return; flushed: -post_err: svc_rdma_recv_ctxt_put(rdma, ctxt); +post_err: set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); } @@ -845,9 +846,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); + percpu_counter_inc(&svcrdma_stat_recv); - atomic_inc(&rdma_stat_recv); - + ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, + DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); /* Prevent svc_xprt_release from releasing pages in rq_pages diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 0b63e1321d74..693d139a8633 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -364,6 +364,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) return 0; } + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, @@ -468,6 +469,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, DMA_TO_DEVICE); if (ret < 0) return -EIO; + percpu_counter_inc(&svcrdma_stat_write); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; @@ -718,6 +720,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) return -EIO; + percpu_counter_inc(&svcrdma_stat_read); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 68af79d4f04f..52c759a8543e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -317,7 +317,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) /* If the SQ is full, wait until an SQ entry is available */ while (1) { if 
((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { - atomic_inc(&rdma_stat_sq_starve); + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait,